/* xref: /linux/net/ipv4/tcp_ipv4.c (revision 791d3ef2e11100449837dc0b6fe884e60ca3a484) */
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 EXPORT_SYMBOL(tcp_hashinfo);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
117 
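	/* sysctl_tcp_tw_reuse: 0 disables reuse of TIME-WAIT ports for new
	 * outgoing connections, 1 allows it when the timestamp checks below
	 * show it to be safe, and 2 (handled here) allows it only for
	 * loopback traffic.
	 */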
118 	if (reuse == 2) {
119 		/* Still does not detect *everything* that goes through
120 		 * lo, since we require a loopback src or dst address
121 		 * or direct binding to 'lo' interface.
122 		 */
123 		bool loopback = false;
124 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 			loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127 		if (tw->tw_family == AF_INET6) {
128 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
130 			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
131 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
132 			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
133 			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
134 				loopback = true;
135 		} else
136 #endif
137 		{
138 			if (ipv4_is_loopback(tw->tw_daddr) ||
139 			    ipv4_is_loopback(tw->tw_rcv_saddr))
140 				loopback = true;
141 		}
142 		if (!loopback)
143 			reuse = 0;
144 	}
145 
146 	/* With PAWS, it is safe from the viewpoint
147 	   of data integrity. Even without PAWS it is safe provided sequence
148 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
149 
150 	   Actually, the idea is close to VJ's, except that the timestamp
151 	   cache is held not per host but per port pair, and the TW bucket
152 	   is used as the state holder.
153 
154 	   If the TW bucket has already been destroyed we fall back to VJ's
155 	   scheme and use the initial timestamp retrieved from the peer table.
156 	 */
157 	if (tcptw->tw_ts_recent_stamp &&
158 	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
159 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
160 		if (tp->write_seq == 0)
161 			tp->write_seq = 1;
162 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
163 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
164 		sock_hold(sktw);
165 		return 1;
166 	}
167 
168 	return 0;
169 }
170 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
171 
172 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
173 			      int addr_len)
174 {
175 	/* This check is replicated from tcp_v4_connect() and intended to
176 	 * prevent BPF program called below from accessing bytes that are out
177 	 * of the bound specified by user in addr_len.
178 	 */
179 	if (addr_len < sizeof(struct sockaddr_in))
180 		return -EINVAL;
181 
182 	sock_owned_by_me(sk);
183 
184 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
185 }
186 
187 /* This will initiate an outgoing connection. */
188 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
189 {
190 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
191 	struct inet_sock *inet = inet_sk(sk);
192 	struct tcp_sock *tp = tcp_sk(sk);
193 	__be16 orig_sport, orig_dport;
194 	__be32 daddr, nexthop;
195 	struct flowi4 *fl4;
196 	struct rtable *rt;
197 	int err;
198 	struct ip_options_rcu *inet_opt;
199 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
200 
201 	if (addr_len < sizeof(struct sockaddr_in))
202 		return -EINVAL;
203 
204 	if (usin->sin_family != AF_INET)
205 		return -EAFNOSUPPORT;
206 
207 	nexthop = daddr = usin->sin_addr.s_addr;
208 	inet_opt = rcu_dereference_protected(inet->inet_opt,
209 					     lockdep_sock_is_held(sk));
210 	if (inet_opt && inet_opt->opt.srr) {
211 		if (!daddr)
212 			return -EINVAL;
213 		nexthop = inet_opt->opt.faddr;
214 	}
215 
216 	orig_sport = inet->inet_sport;
217 	orig_dport = usin->sin_port;
218 	fl4 = &inet->cork.fl.u.ip4;
219 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
220 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
221 			      IPPROTO_TCP,
222 			      orig_sport, orig_dport, sk);
223 	if (IS_ERR(rt)) {
224 		err = PTR_ERR(rt);
225 		if (err == -ENETUNREACH)
226 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
227 		return err;
228 	}
229 
230 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
231 		ip_rt_put(rt);
232 		return -ENETUNREACH;
233 	}
234 
235 	if (!inet_opt || !inet_opt->opt.srr)
236 		daddr = fl4->daddr;
237 
238 	if (!inet->inet_saddr)
239 		inet->inet_saddr = fl4->saddr;
240 	sk_rcv_saddr_set(sk, inet->inet_saddr);
241 
242 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
243 		/* Reset inherited state */
244 		tp->rx_opt.ts_recent	   = 0;
245 		tp->rx_opt.ts_recent_stamp = 0;
246 		if (likely(!tp->repair))
247 			tp->write_seq	   = 0;
248 	}
249 
250 	inet->inet_dport = usin->sin_port;
251 	sk_daddr_set(sk, daddr);
252 
253 	inet_csk(sk)->icsk_ext_hdr_len = 0;
254 	if (inet_opt)
255 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
256 
257 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
258 
259 	/* Socket identity is still unknown (sport may be zero).
260 	 * However we set state to SYN-SENT and, without releasing the socket
261 	 * lock, select a source port, enter ourselves into the hash tables and
262 	 * complete initialization afterwards.
263 	 */
264 	tcp_set_state(sk, TCP_SYN_SENT);
265 	err = inet_hash_connect(tcp_death_row, sk);
266 	if (err)
267 		goto failure;
268 
269 	sk_set_txhash(sk);
270 
271 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
272 			       inet->inet_sport, inet->inet_dport, sk);
273 	if (IS_ERR(rt)) {
274 		err = PTR_ERR(rt);
275 		rt = NULL;
276 		goto failure;
277 	}
278 	/* OK, now commit destination to socket.  */
279 	sk->sk_gso_type = SKB_GSO_TCPV4;
280 	sk_setup_caps(sk, &rt->dst);
281 	rt = NULL;
282 
283 	if (likely(!tp->repair)) {
284 		if (!tp->write_seq)
285 			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
286 						       inet->inet_daddr,
287 						       inet->inet_sport,
288 						       usin->sin_port);
289 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
290 						 inet->inet_saddr,
291 						 inet->inet_daddr);
292 	}
293 
294 	inet->inet_id = tp->write_seq ^ jiffies;
295 
296 	if (tcp_fastopen_defer_connect(sk, &err))
297 		return err;
298 	if (err)
299 		goto failure;
300 
301 	err = tcp_connect(sk);
302 
303 	if (err)
304 		goto failure;
305 
306 	return 0;
307 
308 failure:
309 	/*
310 	 * This unhashes the socket and releases the local port,
311 	 * if necessary.
312 	 */
313 	tcp_set_state(sk, TCP_CLOSE);
314 	ip_rt_put(rt);
315 	sk->sk_route_caps = 0;
316 	inet->inet_dport = 0;
317 	return err;
318 }
319 EXPORT_SYMBOL(tcp_v4_connect);
320 
321 /*
322  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
323  * It can be called through tcp_release_cb() if socket was owned by user
324  * at the time tcp_v4_err() was called to handle ICMP message.
325  */
326 void tcp_v4_mtu_reduced(struct sock *sk)
327 {
328 	struct inet_sock *inet = inet_sk(sk);
329 	struct dst_entry *dst;
330 	u32 mtu;
331 
332 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
333 		return;
334 	mtu = tcp_sk(sk)->mtu_info;
335 	dst = inet_csk_update_pmtu(sk, mtu);
336 	if (!dst)
337 		return;
338 
339 	/* Something is about to go wrong... Remember the soft error
340 	 * in case this connection is not able to recover.
341 	 */
342 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
343 		sk->sk_err_soft = EMSGSIZE;
344 
345 	mtu = dst_mtu(dst);
346 
347 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
348 	    ip_sk_accept_pmtu(sk) &&
349 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
350 		tcp_sync_mss(sk, mtu);
351 
352 		/* Resend the TCP packet because it's
353 		 * clear that the old packet has been
354 		 * dropped. This is the new "fast" path mtu
355 		 * discovery.
356 		 */
357 		tcp_simple_retransmit(sk);
358 	} /* else let the usual retransmit timer handle it */
359 }
360 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
361 
362 static void do_redirect(struct sk_buff *skb, struct sock *sk)
363 {
364 	struct dst_entry *dst = __sk_dst_check(sk, 0);
365 
366 	if (dst)
367 		dst->ops->redirect(dst, sk, skb);
368 }
369 
370 
371 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
372 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
373 {
374 	struct request_sock *req = inet_reqsk(sk);
375 	struct net *net = sock_net(sk);
376 
377 	/* ICMPs are not backlogged, hence we cannot get
378 	 * an established socket here.
379 	 */
380 	if (seq != tcp_rsk(req)->snt_isn) {
381 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
382 	} else if (abort) {
383 		/*
384 		 * Still in SYN_RECV, just remove it silently.
385 		 * There is no good way to pass the error to the newly
386 		 * created socket, and POSIX does not want network
387 		 * errors returned from accept().
388 		 */
389 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
390 		tcp_listendrop(req->rsk_listener);
391 	}
392 	reqsk_put(req);
393 }
394 EXPORT_SYMBOL(tcp_req_err);
395 
396 /*
397  * This routine is called by the ICMP module when it gets some
398  * sort of error condition.  If err < 0 then the socket should
399  * be closed and the error returned to the user.  If err > 0
400  * it's just the icmp type << 8 | icmp code.  After adjustment
401  * header points to the first 8 bytes of the tcp header.  We need
402  * to find the appropriate port.
403  *
404  * The locking strategy used here is very "optimistic". When
405  * someone else accesses the socket, the ICMP is just dropped,
406  * and for some paths there is no check at all.
407  * A more general error queue for queuing errors for later handling
408  * would probably be better.
409  *
410  */
411 
412 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
413 {
414 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
415 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
416 	struct inet_connection_sock *icsk;
417 	struct tcp_sock *tp;
418 	struct inet_sock *inet;
419 	const int type = icmp_hdr(icmp_skb)->type;
420 	const int code = icmp_hdr(icmp_skb)->code;
421 	struct sock *sk;
422 	struct sk_buff *skb;
423 	struct request_sock *fastopen;
424 	u32 seq, snd_una;
425 	s32 remaining;
426 	u32 delta_us;
427 	int err;
428 	struct net *net = dev_net(icmp_skb->dev);
429 
430 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
431 				       th->dest, iph->saddr, ntohs(th->source),
432 				       inet_iif(icmp_skb), 0);
433 	if (!sk) {
434 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
435 		return;
436 	}
437 	if (sk->sk_state == TCP_TIME_WAIT) {
438 		inet_twsk_put(inet_twsk(sk));
439 		return;
440 	}
441 	seq = ntohl(th->seq);
442 	if (sk->sk_state == TCP_NEW_SYN_RECV)
443 		return tcp_req_err(sk, seq,
444 				  type == ICMP_PARAMETERPROB ||
445 				  type == ICMP_TIME_EXCEEDED ||
446 				  (type == ICMP_DEST_UNREACH &&
447 				   (code == ICMP_NET_UNREACH ||
448 				    code == ICMP_HOST_UNREACH)));
449 
450 	bh_lock_sock(sk);
451 	/* If too many ICMPs get dropped on busy
452 	 * servers this needs to be solved differently.
453 	 * We do take care of the PMTU discovery (RFC 1191) special case:
454 	 * we can receive locally generated ICMP messages while the socket is held.
455 	 */
456 	if (sock_owned_by_user(sk)) {
457 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
458 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
459 	}
460 	if (sk->sk_state == TCP_CLOSE)
461 		goto out;
462 
463 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
464 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
465 		goto out;
466 	}
467 
468 	icsk = inet_csk(sk);
469 	tp = tcp_sk(sk);
470 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
471 	fastopen = tp->fastopen_rsk;
472 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
473 	if (sk->sk_state != TCP_LISTEN &&
474 	    !between(seq, snd_una, tp->snd_nxt)) {
475 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
476 		goto out;
477 	}
478 
479 	switch (type) {
480 	case ICMP_REDIRECT:
481 		if (!sock_owned_by_user(sk))
482 			do_redirect(icmp_skb, sk);
483 		goto out;
484 	case ICMP_SOURCE_QUENCH:
485 		/* Just silently ignore these. */
486 		goto out;
487 	case ICMP_PARAMETERPROB:
488 		err = EPROTO;
489 		break;
490 	case ICMP_DEST_UNREACH:
491 		if (code > NR_ICMP_UNREACH)
492 			goto out;
493 
494 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
495 			/* We are not interested in TCP_LISTEN and open_requests
496 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
497 			 * they should go through unfragmented).
498 			 */
499 			if (sk->sk_state == TCP_LISTEN)
500 				goto out;
501 
502 			tp->mtu_info = info;
503 			if (!sock_owned_by_user(sk)) {
504 				tcp_v4_mtu_reduced(sk);
505 			} else {
506 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
507 					sock_hold(sk);
508 			}
509 			goto out;
510 		}
511 
512 		err = icmp_err_convert[code].errno;
513 		/* check if icmp_skb allows revert of backoff
514 		 * (see draft-zimmermann-tcp-lcd) */
515 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
516 			break;
517 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
518 		    !icsk->icsk_backoff || fastopen)
519 			break;
520 
521 		if (sock_owned_by_user(sk))
522 			break;
523 
524 		icsk->icsk_backoff--;
525 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
526 					       TCP_TIMEOUT_INIT;
527 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
528 
529 		skb = tcp_rtx_queue_head(sk);
530 		BUG_ON(!skb);
531 
532 		tcp_mstamp_refresh(tp);
533 		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
534 		remaining = icsk->icsk_rto -
535 			    usecs_to_jiffies(delta_us);
536 
537 		if (remaining > 0) {
538 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
539 						  remaining, TCP_RTO_MAX);
540 		} else {
541 			/* RTO revert clocked out retransmission.
542 			 * Will retransmit now */
543 			tcp_retransmit_timer(sk);
544 		}
545 
546 		break;
547 	case ICMP_TIME_EXCEEDED:
548 		err = EHOSTUNREACH;
549 		break;
550 	default:
551 		goto out;
552 	}
553 
554 	switch (sk->sk_state) {
555 	case TCP_SYN_SENT:
556 	case TCP_SYN_RECV:
557 		/* Only in fast or simultaneous open. If a fast open socket is
558 		 * already accepted it is treated as a connected one below.
559 		 */
560 		if (fastopen && !fastopen->sk)
561 			break;
562 
563 		if (!sock_owned_by_user(sk)) {
564 			sk->sk_err = err;
565 
566 			sk->sk_error_report(sk);
567 
568 			tcp_done(sk);
569 		} else {
570 			sk->sk_err_soft = err;
571 		}
572 		goto out;
573 	}
574 
575 	/* If we've already connected we will keep trying
576 	 * until we time out, or the user gives up.
577 	 *
578 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
579 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
580 	 * but it is obsoleted by pmtu discovery).
581 	 *
582 	 * Note that in the modern internet, where routing is unreliable
583 	 * and broken firewalls sit in every dark corner sending random
584 	 * errors ordered by their masters, even these two messages finally
585 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
586 	 *
587 	 * Now we are in compliance with RFCs.
588 	 *							--ANK (980905)
589 	 */
590 
591 	inet = inet_sk(sk);
592 	if (!sock_owned_by_user(sk) && inet->recverr) {
593 		sk->sk_err = err;
594 		sk->sk_error_report(sk);
595 	} else	{ /* Only an error on timeout */
596 		sk->sk_err_soft = err;
597 	}
598 
599 out:
600 	bh_unlock_sock(sk);
601 	sock_put(sk);
602 }
603 
604 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
605 {
606 	struct tcphdr *th = tcp_hdr(skb);
607 
608 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
609 	skb->csum_start = skb_transport_header(skb) - skb->head;
610 	skb->csum_offset = offsetof(struct tcphdr, check);
611 }
612 
613 /* This routine computes an IPv4 TCP checksum. */
614 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
615 {
616 	const struct inet_sock *inet = inet_sk(sk);
617 
618 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
619 }
620 EXPORT_SYMBOL(tcp_v4_send_check);
621 
622 /*
623  *	This routine will send an RST to the other tcp.
624  *
625  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
626  *		      for the reset?
627  *	Answer: if a packet caused an RST, it is not for a socket
628  *		existing in our system; if it is matched to a socket,
629  *		it is just a duplicate segment or a bug in the other side's TCP.
630  *		So we build the reply based only on parameters
631  *		that arrived with the segment.
632  *	Exception: precedence violation. We do not implement it in any case.
633  */
634 
635 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
636 {
637 	const struct tcphdr *th = tcp_hdr(skb);
638 	struct {
639 		struct tcphdr th;
640 #ifdef CONFIG_TCP_MD5SIG
641 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
642 #endif
643 	} rep;
644 	struct ip_reply_arg arg;
645 #ifdef CONFIG_TCP_MD5SIG
646 	struct tcp_md5sig_key *key = NULL;
647 	const __u8 *hash_location = NULL;
648 	unsigned char newhash[16];
649 	int genhash;
650 	struct sock *sk1 = NULL;
651 #endif
652 	struct net *net;
653 	struct sock *ctl_sk;
654 
655 	/* Never send a reset in response to a reset. */
656 	if (th->rst)
657 		return;
658 
659 	/* If sk is not NULL, it means we did a successful lookup and the
660 	 * incoming route had to be correct. prequeue might have dropped our dst.
661 	 */
662 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
663 		return;
664 
665 	/* Swap the send and the receive. */
666 	memset(&rep, 0, sizeof(rep));
667 	rep.th.dest   = th->source;
668 	rep.th.source = th->dest;
669 	rep.th.doff   = sizeof(struct tcphdr) / 4;
670 	rep.th.rst    = 1;
671 
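	/* RFC 793 "Reset Generation": if the incoming segment has ACK set,
	 * the RST uses SEQ = SEG.ACK and carries no ACK of its own; otherwise
	 * it uses SEQ = 0 (from the memset above) and ACK = SEG.SEQ + SEG.LEN,
	 * counting SYN and FIN as one octet each.
	 */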
672 	if (th->ack) {
673 		rep.th.seq = th->ack_seq;
674 	} else {
675 		rep.th.ack = 1;
676 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
677 				       skb->len - (th->doff << 2));
678 	}
679 
680 	memset(&arg, 0, sizeof(arg));
681 	arg.iov[0].iov_base = (unsigned char *)&rep;
682 	arg.iov[0].iov_len  = sizeof(rep.th);
683 
684 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
685 #ifdef CONFIG_TCP_MD5SIG
686 	rcu_read_lock();
687 	hash_location = tcp_parse_md5sig_option(th);
688 	if (sk && sk_fullsock(sk)) {
689 		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
690 					&ip_hdr(skb)->saddr, AF_INET);
691 	} else if (hash_location) {
692 		/*
693 		 * The active side is lost. Try to find the listening socket through
694 		 * the source port, and then find the md5 key through the listening socket.
695 		 * We do not loosen security here:
696 		 * the incoming packet is checked against the md5 hash of the found key;
697 		 * no RST is generated if the md5 hash doesn't match.
698 		 */
699 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
700 					     ip_hdr(skb)->saddr,
701 					     th->source, ip_hdr(skb)->daddr,
702 					     ntohs(th->source), inet_iif(skb),
703 					     tcp_v4_sdif(skb));
704 		/* don't send rst if it can't find key */
705 		if (!sk1)
706 			goto out;
707 
708 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
709 					&ip_hdr(skb)->saddr, AF_INET);
710 		if (!key)
711 			goto out;
712 
713 
714 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
715 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
716 			goto out;
717 
718 	}
719 
720 	if (key) {
721 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
722 				   (TCPOPT_NOP << 16) |
723 				   (TCPOPT_MD5SIG << 8) |
724 				   TCPOLEN_MD5SIG);
725 		/* Update length and the length the header thinks exists */
726 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
727 		rep.th.doff = arg.iov[0].iov_len / 4;
728 
729 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
730 				     key, ip_hdr(skb)->saddr,
731 				     ip_hdr(skb)->daddr, &rep.th);
732 	}
733 #endif
734 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
735 				      ip_hdr(skb)->saddr, /* XXX */
736 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
737 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
738 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
739 
740 	/* When the socket is gone, all binding information is lost and
741 	 * routing might fail in this case. No choice here: if we choose to force
742 	 * the input interface, we will misroute in case of an asymmetric route.
743 	 */
744 	if (sk) {
745 		arg.bound_dev_if = sk->sk_bound_dev_if;
746 		if (sk_fullsock(sk))
747 			trace_tcp_send_reset(sk, skb);
748 	}
749 
750 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
751 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
752 
753 	arg.tos = ip_hdr(skb)->tos;
754 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
755 	local_bh_disable();
756 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
757 	if (sk)
758 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
759 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
760 	ip_send_unicast_reply(ctl_sk,
761 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
762 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
763 			      &arg, arg.iov[0].iov_len);
764 
765 	ctl_sk->sk_mark = 0;
766 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
767 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
768 	local_bh_enable();
769 
770 #ifdef CONFIG_TCP_MD5SIG
771 out:
772 	rcu_read_unlock();
773 #endif
774 }
775 
776 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
777    outside of socket context, is certainly ugly. What can I do?
778  */
779 
780 static void tcp_v4_send_ack(const struct sock *sk,
781 			    struct sk_buff *skb, u32 seq, u32 ack,
782 			    u32 win, u32 tsval, u32 tsecr, int oif,
783 			    struct tcp_md5sig_key *key,
784 			    int reply_flags, u8 tos)
785 {
786 	const struct tcphdr *th = tcp_hdr(skb);
787 	struct {
788 		struct tcphdr th;
789 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
790 #ifdef CONFIG_TCP_MD5SIG
791 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
792 #endif
793 			];
794 	} rep;
795 	struct net *net = sock_net(sk);
796 	struct ip_reply_arg arg;
797 	struct sock *ctl_sk;
798 
799 	memset(&rep.th, 0, sizeof(struct tcphdr));
800 	memset(&arg, 0, sizeof(arg));
801 
802 	arg.iov[0].iov_base = (unsigned char *)&rep;
803 	arg.iov[0].iov_len  = sizeof(rep.th);
804 	if (tsecr) {
805 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
806 				   (TCPOPT_TIMESTAMP << 8) |
807 				   TCPOLEN_TIMESTAMP);
808 		rep.opt[1] = htonl(tsval);
809 		rep.opt[2] = htonl(tsecr);
810 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
811 	}
812 
813 	/* Swap the send and the receive. */
814 	rep.th.dest    = th->source;
815 	rep.th.source  = th->dest;
816 	rep.th.doff    = arg.iov[0].iov_len / 4;
817 	rep.th.seq     = htonl(seq);
818 	rep.th.ack_seq = htonl(ack);
819 	rep.th.ack     = 1;
820 	rep.th.window  = htons(win);
821 
822 #ifdef CONFIG_TCP_MD5SIG
823 	if (key) {
824 		int offset = (tsecr) ? 3 : 0;
825 
826 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
827 					  (TCPOPT_NOP << 16) |
828 					  (TCPOPT_MD5SIG << 8) |
829 					  TCPOLEN_MD5SIG);
830 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
831 		rep.th.doff = arg.iov[0].iov_len/4;
832 
833 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
834 				    key, ip_hdr(skb)->saddr,
835 				    ip_hdr(skb)->daddr, &rep.th);
836 	}
837 #endif
838 	arg.flags = reply_flags;
839 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
840 				      ip_hdr(skb)->saddr, /* XXX */
841 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
842 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
843 	if (oif)
844 		arg.bound_dev_if = oif;
845 	arg.tos = tos;
846 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
847 	local_bh_disable();
848 	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
849 	if (sk)
850 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
851 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
852 	ip_send_unicast_reply(ctl_sk,
853 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
854 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
855 			      &arg, arg.iov[0].iov_len);
856 
857 	ctl_sk->sk_mark = 0;
858 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
859 	local_bh_enable();
860 }
861 
862 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
863 {
864 	struct inet_timewait_sock *tw = inet_twsk(sk);
865 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
866 
867 	tcp_v4_send_ack(sk, skb,
868 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
869 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
870 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
871 			tcptw->tw_ts_recent,
872 			tw->tw_bound_dev_if,
873 			tcp_twsk_md5_key(tcptw),
874 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
875 			tw->tw_tos
876 			);
877 
878 	inet_twsk_put(tw);
879 }
880 
881 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
882 				  struct request_sock *req)
883 {
884 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
885 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
886 	 */
887 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
888 					     tcp_sk(sk)->snd_nxt;
889 
890 	/* RFC 7323 2.3
891 	 * The window field (SEG.WND) of every outgoing segment, with the
892 	 * exception of <SYN> segments, MUST be right-shifted by
893 	 * Rcv.Wind.Shift bits:
894 	 */
895 	tcp_v4_send_ack(sk, skb, seq,
896 			tcp_rsk(req)->rcv_nxt,
897 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
898 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
899 			req->ts_recent,
900 			0,
901 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
902 					  AF_INET),
903 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
904 			ip_hdr(skb)->tos);
905 }
906 
907 /*
908  *	Send a SYN-ACK after having received a SYN.
909  *	This still operates on a request_sock only, not on a big
910  *	socket.
911  */
912 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
913 			      struct flowi *fl,
914 			      struct request_sock *req,
915 			      struct tcp_fastopen_cookie *foc,
916 			      enum tcp_synack_type synack_type)
917 {
918 	const struct inet_request_sock *ireq = inet_rsk(req);
919 	struct flowi4 fl4;
920 	int err = -1;
921 	struct sk_buff *skb;
922 
923 	/* First, grab a route. */
924 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
925 		return -1;
926 
927 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
928 
929 	if (skb) {
930 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
931 
932 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
933 					    ireq->ir_rmt_addr,
934 					    ireq_opt_deref(ireq));
935 		err = net_xmit_eval(err);
936 	}
937 
938 	return err;
939 }
940 
941 /*
942  *	IPv4 request_sock destructor.
943  */
944 static void tcp_v4_reqsk_destructor(struct request_sock *req)
945 {
946 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
947 }
948 
949 #ifdef CONFIG_TCP_MD5SIG
950 /*
951  * RFC2385 MD5 checksumming requires a mapping of
952  * IP address->MD5 Key.
953  * We need to maintain these in the sk structure.
954  */
955 
956 /* Find the Key structure for an address.  */
957 struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
958 					 const union tcp_md5_addr *addr,
959 					 int family)
960 {
961 	const struct tcp_sock *tp = tcp_sk(sk);
962 	struct tcp_md5sig_key *key;
963 	const struct tcp_md5sig_info *md5sig;
964 	__be32 mask;
965 	struct tcp_md5sig_key *best_match = NULL;
966 	bool match;
967 
968 	/* caller either holds rcu_read_lock() or socket lock */
969 	md5sig = rcu_dereference_check(tp->md5sig_info,
970 				       lockdep_sock_is_held(sk));
971 	if (!md5sig)
972 		return NULL;
973 
974 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
975 		if (key->family != family)
976 			continue;
977 
978 		if (family == AF_INET) {
979 			mask = inet_make_mask(key->prefixlen);
980 			match = (key->addr.a4.s_addr & mask) ==
981 				(addr->a4.s_addr & mask);
982 #if IS_ENABLED(CONFIG_IPV6)
983 		} else if (family == AF_INET6) {
984 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
985 						  key->prefixlen);
986 #endif
987 		} else {
988 			match = false;
989 		}
990 
991 		if (match && (!best_match ||
992 			      key->prefixlen > best_match->prefixlen))
993 			best_match = key;
994 	}
995 	return best_match;
996 }
997 EXPORT_SYMBOL(tcp_md5_do_lookup);
998 
999 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1000 						      const union tcp_md5_addr *addr,
1001 						      int family, u8 prefixlen)
1002 {
1003 	const struct tcp_sock *tp = tcp_sk(sk);
1004 	struct tcp_md5sig_key *key;
1005 	unsigned int size = sizeof(struct in_addr);
1006 	const struct tcp_md5sig_info *md5sig;
1007 
1008 	/* caller either holds rcu_read_lock() or socket lock */
1009 	md5sig = rcu_dereference_check(tp->md5sig_info,
1010 				       lockdep_sock_is_held(sk));
1011 	if (!md5sig)
1012 		return NULL;
1013 #if IS_ENABLED(CONFIG_IPV6)
1014 	if (family == AF_INET6)
1015 		size = sizeof(struct in6_addr);
1016 #endif
1017 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
1018 		if (key->family != family)
1019 			continue;
1020 		if (!memcmp(&key->addr, addr, size) &&
1021 		    key->prefixlen == prefixlen)
1022 			return key;
1023 	}
1024 	return NULL;
1025 }
1026 
1027 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1028 					 const struct sock *addr_sk)
1029 {
1030 	const union tcp_md5_addr *addr;
1031 
1032 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1033 	return tcp_md5_do_lookup(sk, addr, AF_INET);
1034 }
1035 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1036 
1037 /* This can be called on a newly created socket, from other files */
1038 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1039 		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
1040 		   gfp_t gfp)
1041 {
1042 	/* Add Key to the list */
1043 	struct tcp_md5sig_key *key;
1044 	struct tcp_sock *tp = tcp_sk(sk);
1045 	struct tcp_md5sig_info *md5sig;
1046 
1047 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1048 	if (key) {
1049 		/* Pre-existing entry - just update that one. */
1050 		memcpy(key->key, newkey, newkeylen);
1051 		key->keylen = newkeylen;
1052 		return 0;
1053 	}
1054 
1055 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1056 					   lockdep_sock_is_held(sk));
1057 	if (!md5sig) {
1058 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1059 		if (!md5sig)
1060 			return -ENOMEM;
1061 
1062 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1063 		INIT_HLIST_HEAD(&md5sig->head);
1064 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1065 	}
1066 
1067 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1068 	if (!key)
1069 		return -ENOMEM;
1070 	if (!tcp_alloc_md5sig_pool()) {
1071 		sock_kfree_s(sk, key, sizeof(*key));
1072 		return -ENOMEM;
1073 	}
1074 
1075 	memcpy(key->key, newkey, newkeylen);
1076 	key->keylen = newkeylen;
1077 	key->family = family;
1078 	key->prefixlen = prefixlen;
1079 	memcpy(&key->addr, addr,
1080 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1081 				      sizeof(struct in_addr));
1082 	hlist_add_head_rcu(&key->node, &md5sig->head);
1083 	return 0;
1084 }
1085 EXPORT_SYMBOL(tcp_md5_do_add);
1086 
1087 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1088 		   u8 prefixlen)
1089 {
1090 	struct tcp_md5sig_key *key;
1091 
1092 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1093 	if (!key)
1094 		return -ENOENT;
1095 	hlist_del_rcu(&key->node);
1096 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1097 	kfree_rcu(key, rcu);
1098 	return 0;
1099 }
1100 EXPORT_SYMBOL(tcp_md5_do_del);
1101 
1102 static void tcp_clear_md5_list(struct sock *sk)
1103 {
1104 	struct tcp_sock *tp = tcp_sk(sk);
1105 	struct tcp_md5sig_key *key;
1106 	struct hlist_node *n;
1107 	struct tcp_md5sig_info *md5sig;
1108 
1109 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1110 
1111 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1112 		hlist_del_rcu(&key->node);
1113 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1114 		kfree_rcu(key, rcu);
1115 	}
1116 }
1117 
1118 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1119 				 char __user *optval, int optlen)
1120 {
1121 	struct tcp_md5sig cmd;
1122 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1123 	u8 prefixlen = 32;
1124 
1125 	if (optlen < sizeof(cmd))
1126 		return -EINVAL;
1127 
1128 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1129 		return -EFAULT;
1130 
1131 	if (sin->sin_family != AF_INET)
1132 		return -EINVAL;
1133 
1134 	if (optname == TCP_MD5SIG_EXT &&
1135 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1136 		prefixlen = cmd.tcpm_prefixlen;
1137 		if (prefixlen > 32)
1138 			return -EINVAL;
1139 	}
1140 
1141 	if (!cmd.tcpm_keylen)
1142 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1143 				      AF_INET, prefixlen);
1144 
1145 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1146 		return -EINVAL;
1147 
1148 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1149 			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1150 			      GFP_KERNEL);
1151 }
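
/* A minimal userspace sketch of the setsockopt() interface parsed above,
 * assuming the uapi definitions from <linux/tcp.h> (struct tcp_md5sig,
 * TCP_MD5SIG, TCP_MD5SIG_EXT) and an already created TCP socket fd; the
 * peer address and key are placeholders.
 */
#if 0	/* userspace illustration only, not part of the kernel build */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>

static int example_set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
				   const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));	/* address the key applies to */
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	/* TCP_MD5SIG installs a host (/32) key; TCP_MD5SIG_EXT with
	 * tcpm_flags = TCP_MD5SIG_FLAG_PREFIX and tcpm_prefixlen set
	 * would cover a whole prefix, as handled above.
	 */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif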
1152 
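/* RFC 2385 digests cover, in order: the TCP pseudo-header (addresses,
 * protocol and segment length), the TCP header with its checksum field
 * zeroed, the segment payload (if any), and finally the key itself.
 * tcp_v4_md5_hash_headers() feeds the first two pieces to the ahash
 * request; the callers below add the payload (where applicable) and the key.
 */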
1153 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1154 				   __be32 daddr, __be32 saddr,
1155 				   const struct tcphdr *th, int nbytes)
1156 {
1157 	struct tcp4_pseudohdr *bp;
1158 	struct scatterlist sg;
1159 	struct tcphdr *_th;
1160 
1161 	bp = hp->scratch;
1162 	bp->saddr = saddr;
1163 	bp->daddr = daddr;
1164 	bp->pad = 0;
1165 	bp->protocol = IPPROTO_TCP;
1166 	bp->len = cpu_to_be16(nbytes);
1167 
1168 	_th = (struct tcphdr *)(bp + 1);
1169 	memcpy(_th, th, sizeof(*th));
1170 	_th->check = 0;
1171 
1172 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1173 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1174 				sizeof(*bp) + sizeof(*th));
1175 	return crypto_ahash_update(hp->md5_req);
1176 }
1177 
1178 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1179 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1180 {
1181 	struct tcp_md5sig_pool *hp;
1182 	struct ahash_request *req;
1183 
1184 	hp = tcp_get_md5sig_pool();
1185 	if (!hp)
1186 		goto clear_hash_noput;
1187 	req = hp->md5_req;
1188 
1189 	if (crypto_ahash_init(req))
1190 		goto clear_hash;
1191 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1192 		goto clear_hash;
1193 	if (tcp_md5_hash_key(hp, key))
1194 		goto clear_hash;
1195 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1196 	if (crypto_ahash_final(req))
1197 		goto clear_hash;
1198 
1199 	tcp_put_md5sig_pool();
1200 	return 0;
1201 
1202 clear_hash:
1203 	tcp_put_md5sig_pool();
1204 clear_hash_noput:
1205 	memset(md5_hash, 0, 16);
1206 	return 1;
1207 }
1208 
1209 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1210 			const struct sock *sk,
1211 			const struct sk_buff *skb)
1212 {
1213 	struct tcp_md5sig_pool *hp;
1214 	struct ahash_request *req;
1215 	const struct tcphdr *th = tcp_hdr(skb);
1216 	__be32 saddr, daddr;
1217 
1218 	if (sk) { /* valid for establish/request sockets */
1219 		saddr = sk->sk_rcv_saddr;
1220 		daddr = sk->sk_daddr;
1221 	} else {
1222 		const struct iphdr *iph = ip_hdr(skb);
1223 		saddr = iph->saddr;
1224 		daddr = iph->daddr;
1225 	}
1226 
1227 	hp = tcp_get_md5sig_pool();
1228 	if (!hp)
1229 		goto clear_hash_noput;
1230 	req = hp->md5_req;
1231 
1232 	if (crypto_ahash_init(req))
1233 		goto clear_hash;
1234 
1235 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1236 		goto clear_hash;
1237 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1238 		goto clear_hash;
1239 	if (tcp_md5_hash_key(hp, key))
1240 		goto clear_hash;
1241 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1242 	if (crypto_ahash_final(req))
1243 		goto clear_hash;
1244 
1245 	tcp_put_md5sig_pool();
1246 	return 0;
1247 
1248 clear_hash:
1249 	tcp_put_md5sig_pool();
1250 clear_hash_noput:
1251 	memset(md5_hash, 0, 16);
1252 	return 1;
1253 }
1254 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1255 
1256 #endif
1257 
1258 /* Called with rcu_read_lock() */
1259 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1260 				    const struct sk_buff *skb)
1261 {
1262 #ifdef CONFIG_TCP_MD5SIG
1263 	/*
1264 	 * This gets called for each TCP segment that arrives
1265 	 * so we want to be efficient.
1266 	 * We have 3 drop cases:
1267 	 * o No MD5 hash and one expected.
1268 	 * o MD5 hash and we're not expecting one.
1269 	 * o MD5 hash and it's wrong.
1270 	 */
1271 	const __u8 *hash_location = NULL;
1272 	struct tcp_md5sig_key *hash_expected;
1273 	const struct iphdr *iph = ip_hdr(skb);
1274 	const struct tcphdr *th = tcp_hdr(skb);
1275 	int genhash;
1276 	unsigned char newhash[16];
1277 
1278 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1279 					  AF_INET);
1280 	hash_location = tcp_parse_md5sig_option(th);
1281 
1282 	/* We've parsed the options - do we have a hash? */
1283 	if (!hash_expected && !hash_location)
1284 		return false;
1285 
1286 	if (hash_expected && !hash_location) {
1287 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1288 		return true;
1289 	}
1290 
1291 	if (!hash_expected && hash_location) {
1292 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1293 		return true;
1294 	}
1295 
1296 	/* Okay, so this is hash_expected and hash_location -
1297 	 * so we need to calculate the checksum.
1298 	 */
1299 	genhash = tcp_v4_md5_hash_skb(newhash,
1300 				      hash_expected,
1301 				      NULL, skb);
1302 
1303 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1304 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1305 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1306 				     &iph->saddr, ntohs(th->source),
1307 				     &iph->daddr, ntohs(th->dest),
1308 				     genhash ? " tcp_v4_calc_md5_hash failed"
1309 				     : "");
1310 		return true;
1311 	}
1312 	return false;
1313 #endif
1314 	return false;
1315 }
1316 
1317 static void tcp_v4_init_req(struct request_sock *req,
1318 			    const struct sock *sk_listener,
1319 			    struct sk_buff *skb)
1320 {
1321 	struct inet_request_sock *ireq = inet_rsk(req);
1322 	struct net *net = sock_net(sk_listener);
1323 
1324 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1325 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1326 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1327 }
1328 
1329 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1330 					  struct flowi *fl,
1331 					  const struct request_sock *req)
1332 {
1333 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1334 }
1335 
1336 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1337 	.family		=	PF_INET,
1338 	.obj_size	=	sizeof(struct tcp_request_sock),
1339 	.rtx_syn_ack	=	tcp_rtx_synack,
1340 	.send_ack	=	tcp_v4_reqsk_send_ack,
1341 	.destructor	=	tcp_v4_reqsk_destructor,
1342 	.send_reset	=	tcp_v4_send_reset,
1343 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1344 };
1345 
1346 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1347 	.mss_clamp	=	TCP_MSS_DEFAULT,
1348 #ifdef CONFIG_TCP_MD5SIG
1349 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1350 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1351 #endif
1352 	.init_req	=	tcp_v4_init_req,
1353 #ifdef CONFIG_SYN_COOKIES
1354 	.cookie_init_seq =	cookie_v4_init_sequence,
1355 #endif
1356 	.route_req	=	tcp_v4_route_req,
1357 	.init_seq	=	tcp_v4_init_seq,
1358 	.init_ts_off	=	tcp_v4_init_ts_off,
1359 	.send_synack	=	tcp_v4_send_synack,
1360 };
1361 
1362 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1363 {
1364 	/* Never answer SYNs sent to broadcast or multicast */
1365 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1366 		goto drop;
1367 
1368 	return tcp_conn_request(&tcp_request_sock_ops,
1369 				&tcp_request_sock_ipv4_ops, sk, skb);
1370 
1371 drop:
1372 	tcp_listendrop(sk);
1373 	return 0;
1374 }
1375 EXPORT_SYMBOL(tcp_v4_conn_request);
1376 
1377 
1378 /*
1379  * The three-way handshake has completed - we got a valid ACK -
1380  * now create the new socket.
1381  */
1382 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1383 				  struct request_sock *req,
1384 				  struct dst_entry *dst,
1385 				  struct request_sock *req_unhash,
1386 				  bool *own_req)
1387 {
1388 	struct inet_request_sock *ireq;
1389 	struct inet_sock *newinet;
1390 	struct tcp_sock *newtp;
1391 	struct sock *newsk;
1392 #ifdef CONFIG_TCP_MD5SIG
1393 	struct tcp_md5sig_key *key;
1394 #endif
1395 	struct ip_options_rcu *inet_opt;
1396 
1397 	if (sk_acceptq_is_full(sk))
1398 		goto exit_overflow;
1399 
1400 	newsk = tcp_create_openreq_child(sk, req, skb);
1401 	if (!newsk)
1402 		goto exit_nonewsk;
1403 
1404 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1405 	inet_sk_rx_dst_set(newsk, skb);
1406 
1407 	newtp		      = tcp_sk(newsk);
1408 	newinet		      = inet_sk(newsk);
1409 	ireq		      = inet_rsk(req);
1410 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1411 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1412 	newsk->sk_bound_dev_if = ireq->ir_iif;
1413 	newinet->inet_saddr   = ireq->ir_loc_addr;
1414 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1415 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1416 	newinet->mc_index     = inet_iif(skb);
1417 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1418 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1419 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1420 	if (inet_opt)
1421 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1422 	newinet->inet_id = newtp->write_seq ^ jiffies;
1423 
1424 	if (!dst) {
1425 		dst = inet_csk_route_child_sock(sk, newsk, req);
1426 		if (!dst)
1427 			goto put_and_exit;
1428 	} else {
1429 		/* syncookie case : see end of cookie_v4_check() */
1430 	}
1431 	sk_setup_caps(newsk, dst);
1432 
1433 	tcp_ca_openreq_child(newsk, dst);
1434 
1435 	tcp_sync_mss(newsk, dst_mtu(dst));
1436 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1437 
1438 	tcp_initialize_rcv_mss(newsk);
1439 
1440 #ifdef CONFIG_TCP_MD5SIG
1441 	/* Copy over the MD5 key from the original socket */
1442 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1443 				AF_INET);
1444 	if (key) {
1445 		/*
1446 		 * We're using one, so create a matching key
1447 		 * on the newsk structure. If we fail to get
1448 		 * memory, then we end up not copying the key
1449 		 * across. Shucks.
1450 		 */
1451 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1452 			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1453 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1454 	}
1455 #endif
1456 
1457 	if (__inet_inherit_port(sk, newsk) < 0)
1458 		goto put_and_exit;
1459 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1460 	if (likely(*own_req)) {
1461 		tcp_move_syn(newtp, req);
1462 		ireq->ireq_opt = NULL;
1463 	} else {
1464 		newinet->inet_opt = NULL;
1465 	}
1466 	return newsk;
1467 
1468 exit_overflow:
1469 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1470 exit_nonewsk:
1471 	dst_release(dst);
1472 exit:
1473 	tcp_listendrop(sk);
1474 	return NULL;
1475 put_and_exit:
1476 	newinet->inet_opt = NULL;
1477 	inet_csk_prepare_forced_close(newsk);
1478 	tcp_done(newsk);
1479 	goto exit;
1480 }
1481 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1482 
1483 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1484 {
1485 #ifdef CONFIG_SYN_COOKIES
1486 	const struct tcphdr *th = tcp_hdr(skb);
1487 
1488 	if (!th->syn)
1489 		sk = cookie_v4_check(sk, skb);
1490 #endif
1491 	return sk;
1492 }
1493 
1494 /* The socket must have its spinlock held when we get
1495  * here, unless it is a TCP_LISTEN socket.
1496  *
1497  * We have a potential double-lock case here, so even when
1498  * doing backlog processing we use the BH locking scheme.
1499  * This is because we cannot sleep with the original spinlock
1500  * held.
1501  */
1502 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1503 {
1504 	struct sock *rsk;
1505 
1506 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1507 		struct dst_entry *dst = sk->sk_rx_dst;
1508 
1509 		sock_rps_save_rxhash(sk, skb);
1510 		sk_mark_napi_id(sk, skb);
1511 		if (dst) {
1512 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1513 			    !dst->ops->check(dst, 0)) {
1514 				dst_release(dst);
1515 				sk->sk_rx_dst = NULL;
1516 			}
1517 		}
1518 		tcp_rcv_established(sk, skb);
1519 		return 0;
1520 	}
1521 
1522 	if (tcp_checksum_complete(skb))
1523 		goto csum_err;
1524 
1525 	if (sk->sk_state == TCP_LISTEN) {
1526 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1527 
1528 		if (!nsk)
1529 			goto discard;
1530 		if (nsk != sk) {
1531 			if (tcp_child_process(sk, nsk, skb)) {
1532 				rsk = nsk;
1533 				goto reset;
1534 			}
1535 			return 0;
1536 		}
1537 	} else
1538 		sock_rps_save_rxhash(sk, skb);
1539 
1540 	if (tcp_rcv_state_process(sk, skb)) {
1541 		rsk = sk;
1542 		goto reset;
1543 	}
1544 	return 0;
1545 
1546 reset:
1547 	tcp_v4_send_reset(rsk, skb);
1548 discard:
1549 	kfree_skb(skb);
1550 	/* Be careful here. If this function gets more complicated and
1551 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1552 	 * might be destroyed here. This current version compiles correctly,
1553 	 * but you have been warned.
1554 	 */
1555 	return 0;
1556 
1557 csum_err:
1558 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1559 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1560 	goto discard;
1561 }
1562 EXPORT_SYMBOL(tcp_v4_do_rcv);
1563 
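/* Called from the IP layer before the routing decision: look up an
 * established socket early so that its cached rx_dst (if still valid for
 * the incoming interface) can be attached to the skb and the normal route
 * lookup can be skipped.
 */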
1564 int tcp_v4_early_demux(struct sk_buff *skb)
1565 {
1566 	const struct iphdr *iph;
1567 	const struct tcphdr *th;
1568 	struct sock *sk;
1569 
1570 	if (skb->pkt_type != PACKET_HOST)
1571 		return 0;
1572 
1573 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1574 		return 0;
1575 
1576 	iph = ip_hdr(skb);
1577 	th = tcp_hdr(skb);
1578 
1579 	if (th->doff < sizeof(struct tcphdr) / 4)
1580 		return 0;
1581 
1582 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1583 				       iph->saddr, th->source,
1584 				       iph->daddr, ntohs(th->dest),
1585 				       skb->skb_iif, inet_sdif(skb));
1586 	if (sk) {
1587 		skb->sk = sk;
1588 		skb->destructor = sock_edemux;
1589 		if (sk_fullsock(sk)) {
1590 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1591 
1592 			if (dst)
1593 				dst = dst_check(dst, 0);
1594 			if (dst &&
1595 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1596 				skb_dst_set_noref(skb, dst);
1597 		}
1598 	}
1599 	return 0;
1600 }
1601 
1602 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1603 {
1604 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1605 
1606 	/* Only the socket owner can try to collapse/prune rx queues
1607 	 * to reduce memory overhead, so add a little headroom here.
1608 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1609 	 */
1610 	limit += 64*1024;
1611 
1612 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1613 	 * we can fix skb->truesize to its real value to avoid future drops.
1614 	 * This is valid because skb is not yet charged to the socket.
1615 	 * It has been noticed that pure SACK packets were sometimes dropped
1616 	 * (if cooked by drivers without the copybreak feature).
1617 	 */
1618 	skb_condense(skb);
1619 
1620 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1621 		bh_unlock_sock(sk);
1622 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1623 		return true;
1624 	}
1625 	return false;
1626 }
1627 EXPORT_SYMBOL(tcp_add_backlog);
1628 
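/* Run the socket filter (e.g. one attached with SO_ATTACH_FILTER) with the
 * TCP header length as the minimum that may remain; if the filter trimmed
 * payload bytes, shrink end_seq by the same amount so that sequence
 * accounting stays consistent.
 */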
1629 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1630 {
1631 	struct tcphdr *th = (struct tcphdr *)skb->data;
1632 	unsigned int eaten = skb->len;
1633 	int err;
1634 
1635 	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1636 	if (!err) {
1637 		eaten -= skb->len;
1638 		TCP_SKB_CB(skb)->end_seq -= eaten;
1639 	}
1640 	return err;
1641 }
1642 EXPORT_SYMBOL(tcp_filter);
1643 
1644 static void tcp_v4_restore_cb(struct sk_buff *skb)
1645 {
1646 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1647 		sizeof(struct inet_skb_parm));
1648 }
1649 
1650 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1651 			   const struct tcphdr *th)
1652 {
1653 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1654 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1655 	 */
1656 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1657 		sizeof(struct inet_skb_parm));
1658 	barrier();
1659 
1660 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1661 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1662 				    skb->len - th->doff * 4);
1663 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1664 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1665 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1666 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1667 	TCP_SKB_CB(skb)->sacked	 = 0;
1668 	TCP_SKB_CB(skb)->has_rxtstamp =
1669 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1670 }
1671 
1672 /*
1673  *	From tcp_input.c
1674  */
1675 
1676 int tcp_v4_rcv(struct sk_buff *skb)
1677 {
1678 	struct net *net = dev_net(skb->dev);
1679 	int sdif = inet_sdif(skb);
1680 	const struct iphdr *iph;
1681 	const struct tcphdr *th;
1682 	bool refcounted;
1683 	struct sock *sk;
1684 	int ret;
1685 
1686 	if (skb->pkt_type != PACKET_HOST)
1687 		goto discard_it;
1688 
1689 	/* Count it even if it's bad */
1690 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1691 
1692 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1693 		goto discard_it;
1694 
1695 	th = (const struct tcphdr *)skb->data;
1696 
1697 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1698 		goto bad_packet;
1699 	if (!pskb_may_pull(skb, th->doff * 4))
1700 		goto discard_it;
1701 
1702 	/* An explanation is required here, I think.
1703 	 * Packet length and doff are validated by header prediction,
1704 	 * provided the case of th->doff==0 is eliminated.
1705 	 * So, we defer the checks. */
1706 
1707 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1708 		goto csum_error;
1709 
1710 	th = (const struct tcphdr *)skb->data;
1711 	iph = ip_hdr(skb);
1712 lookup:
1713 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1714 			       th->dest, sdif, &refcounted);
1715 	if (!sk)
1716 		goto no_tcp_socket;
1717 
1718 process:
1719 	if (sk->sk_state == TCP_TIME_WAIT)
1720 		goto do_time_wait;
1721 
1722 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1723 		struct request_sock *req = inet_reqsk(sk);
1724 		bool req_stolen = false;
1725 		struct sock *nsk;
1726 
1727 		sk = req->rsk_listener;
1728 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1729 			sk_drops_add(sk, skb);
1730 			reqsk_put(req);
1731 			goto discard_it;
1732 		}
1733 		if (tcp_checksum_complete(skb)) {
1734 			reqsk_put(req);
1735 			goto csum_error;
1736 		}
1737 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1738 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1739 			goto lookup;
1740 		}
1741 		/* We own a reference on the listener, increase it again
1742 		 * as we might lose it too soon.
1743 		 */
1744 		sock_hold(sk);
1745 		refcounted = true;
1746 		nsk = NULL;
1747 		if (!tcp_filter(sk, skb)) {
1748 			th = (const struct tcphdr *)skb->data;
1749 			iph = ip_hdr(skb);
1750 			tcp_v4_fill_cb(skb, iph, th);
1751 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1752 		}
1753 		if (!nsk) {
1754 			reqsk_put(req);
1755 			if (req_stolen) {
1756 				/* Another cpu got exclusive access to req
1757 				 * and created a full blown socket.
1758 				 * Try to feed this packet to this socket
1759 				 * instead of discarding it.
1760 				 */
1761 				tcp_v4_restore_cb(skb);
1762 				sock_put(sk);
1763 				goto lookup;
1764 			}
1765 			goto discard_and_relse;
1766 		}
1767 		if (nsk == sk) {
1768 			reqsk_put(req);
1769 			tcp_v4_restore_cb(skb);
1770 		} else if (tcp_child_process(sk, nsk, skb)) {
1771 			tcp_v4_send_reset(nsk, skb);
1772 			goto discard_and_relse;
1773 		} else {
1774 			sock_put(sk);
1775 			return 0;
1776 		}
1777 	}
1778 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1779 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1780 		goto discard_and_relse;
1781 	}
1782 
1783 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1784 		goto discard_and_relse;
1785 
1786 	if (tcp_v4_inbound_md5_hash(sk, skb))
1787 		goto discard_and_relse;
1788 
1789 	nf_reset(skb);
1790 
1791 	if (tcp_filter(sk, skb))
1792 		goto discard_and_relse;
1793 	th = (const struct tcphdr *)skb->data;
1794 	iph = ip_hdr(skb);
1795 	tcp_v4_fill_cb(skb, iph, th);
1796 
1797 	skb->dev = NULL;
1798 
1799 	if (sk->sk_state == TCP_LISTEN) {
1800 		ret = tcp_v4_do_rcv(sk, skb);
1801 		goto put_and_return;
1802 	}
1803 
1804 	sk_incoming_cpu_update(sk);
1805 
1806 	bh_lock_sock_nested(sk);
1807 	tcp_segs_in(tcp_sk(sk), skb);
1808 	ret = 0;
1809 	if (!sock_owned_by_user(sk)) {
1810 		ret = tcp_v4_do_rcv(sk, skb);
1811 	} else if (tcp_add_backlog(sk, skb)) {
1812 		goto discard_and_relse;
1813 	}
1814 	bh_unlock_sock(sk);
1815 
1816 put_and_return:
1817 	if (refcounted)
1818 		sock_put(sk);
1819 
1820 	return ret;
1821 
1822 no_tcp_socket:
1823 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1824 		goto discard_it;
1825 
1826 	tcp_v4_fill_cb(skb, iph, th);
1827 
1828 	if (tcp_checksum_complete(skb)) {
1829 csum_error:
1830 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1831 bad_packet:
1832 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
1833 	} else {
1834 		tcp_v4_send_reset(NULL, skb);
1835 	}
1836 
1837 discard_it:
1838 	/* Discard frame. */
1839 	kfree_skb(skb);
1840 	return 0;
1841 
1842 discard_and_relse:
1843 	sk_drops_add(sk, skb);
1844 	if (refcounted)
1845 		sock_put(sk);
1846 	goto discard_it;
1847 
1848 do_time_wait:
1849 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1850 		inet_twsk_put(inet_twsk(sk));
1851 		goto discard_it;
1852 	}
1853 
1854 	tcp_v4_fill_cb(skb, iph, th);
1855 
1856 	if (tcp_checksum_complete(skb)) {
1857 		inet_twsk_put(inet_twsk(sk));
1858 		goto csum_error;
1859 	}
1860 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1861 	case TCP_TW_SYN: {
1862 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1863 							&tcp_hashinfo, skb,
1864 							__tcp_hdrlen(th),
1865 							iph->saddr, th->source,
1866 							iph->daddr, th->dest,
1867 							inet_iif(skb),
1868 							sdif);
1869 		if (sk2) {
1870 			inet_twsk_deschedule_put(inet_twsk(sk));
1871 			sk = sk2;
1872 			tcp_v4_restore_cb(skb);
1873 			refcounted = false;
1874 			goto process;
1875 		}
1876 	}
1877 		/* to ACK */
1878 		/* fall through */
1879 	case TCP_TW_ACK:
1880 		tcp_v4_timewait_ack(sk, skb);
1881 		break;
1882 	case TCP_TW_RST:
1883 		tcp_v4_send_reset(sk, skb);
1884 		inet_twsk_deschedule_put(inet_twsk(sk));
1885 		goto discard_it;
1886 	case TCP_TW_SUCCESS:;
1887 	}
1888 	goto discard_it;
1889 }
1890 
1891 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1892 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1893 	.twsk_unique	= tcp_twsk_unique,
1894 	.twsk_destructor= tcp_twsk_destructor,
1895 };
1896 
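/* Cache the validated input route and the incoming interface index on
 * the socket, so that later packets for this established connection can
 * reuse sk->sk_rx_dst instead of performing a fresh route lookup.
 */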
1897 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1898 {
1899 	struct dst_entry *dst = skb_dst(skb);
1900 
1901 	if (dst && dst_hold_safe(dst)) {
1902 		sk->sk_rx_dst = dst;
1903 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1904 	}
1905 }
1906 EXPORT_SYMBOL(inet_sk_rx_dst_set);
1907 
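/* Address-family specific hooks used by the connection-level code; the
 * core TCP paths dispatch through inet_csk(sk)->icsk_af_ops rather than
 * calling the IPv4 routines directly.  For instance, the output path in
 * tcp_output.c transmits a segment roughly via
 * icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl), which resolves
 * to ip_queue_xmit() here.  (Illustration only; see tcp_output.c for the
 * exact call site.)
 */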
1908 const struct inet_connection_sock_af_ops ipv4_specific = {
1909 	.queue_xmit	   = ip_queue_xmit,
1910 	.send_check	   = tcp_v4_send_check,
1911 	.rebuild_header	   = inet_sk_rebuild_header,
1912 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
1913 	.conn_request	   = tcp_v4_conn_request,
1914 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1915 	.net_header_len	   = sizeof(struct iphdr),
1916 	.setsockopt	   = ip_setsockopt,
1917 	.getsockopt	   = ip_getsockopt,
1918 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1919 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1920 #ifdef CONFIG_COMPAT
1921 	.compat_setsockopt = compat_ip_setsockopt,
1922 	.compat_getsockopt = compat_ip_getsockopt,
1923 #endif
1924 	.mtu_reduced	   = tcp_v4_mtu_reduced,
1925 };
1926 EXPORT_SYMBOL(ipv4_specific);
1927 
1928 #ifdef CONFIG_TCP_MD5SIG
1929 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1930 	.md5_lookup		= tcp_v4_md5_lookup,
1931 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1932 	.md5_parse		= tcp_v4_parse_md5_keys,
1933 };
1934 #endif
1935 
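/* tcp_v4_init_sock() is installed as tcp_prot.init (see tcp_prot below)
 * and runs once for each newly created TCP socket, layering the
 * IPv4-specific operation tables on top of the address-family
 * independent setup done by tcp_init_sock().
 */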
1936 /* NOTE: A lot of things are set to zero explicitly by the call to
1937  *       sk_alloc(), so they need not be done here.
1938  */
1939 static int tcp_v4_init_sock(struct sock *sk)
1940 {
1941 	struct inet_connection_sock *icsk = inet_csk(sk);
1942 
1943 	tcp_init_sock(sk);
1944 
1945 	icsk->icsk_af_ops = &ipv4_specific;
1946 
1947 #ifdef CONFIG_TCP_MD5SIG
1948 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1949 #endif
1950 
1951 	return 0;
1952 }
1953 
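/* Release the TCP-specific state of a dying socket: timers, congestion
 * control and ULP state, the write and out-of-order queues, any MD5
 * keys, the bound local port, and TCP Fast Open resources.
 */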
1954 void tcp_v4_destroy_sock(struct sock *sk)
1955 {
1956 	struct tcp_sock *tp = tcp_sk(sk);
1957 
1958 	trace_tcp_destroy_sock(sk);
1959 
1960 	tcp_clear_xmit_timers(sk);
1961 
1962 	tcp_cleanup_congestion_control(sk);
1963 
1964 	tcp_cleanup_ulp(sk);
1965 
1966 	/* Clean up the write buffer. */
1967 	tcp_write_queue_purge(sk);
1968 
1969 	/* Check if we want to disable active TFO */
1970 	tcp_fastopen_active_disable_ofo_check(sk);
1971 
1972 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1973 	skb_rbtree_purge(&tp->out_of_order_queue);
1974 
1975 #ifdef CONFIG_TCP_MD5SIG
1976 	/* Clean up the MD5 key list, if any */
1977 	if (tp->md5sig_info) {
1978 		tcp_clear_md5_list(sk);
1979 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1980 		tp->md5sig_info = NULL;
1981 	}
1982 #endif
1983 
1984 	/* Clean up a referenced TCP bind bucket. */
1985 	if (inet_csk(sk)->icsk_bind_hash)
1986 		inet_put_port(sk);
1987 
1988 	BUG_ON(tp->fastopen_rsk);
1989 
1990 	/* If the socket was aborted during the connect operation */
1991 	tcp_free_fastopen_req(tp);
1992 	tcp_fastopen_destroy_cipher(sk);
1993 	tcp_saved_syn_free(tp);
1994 
1995 	sk_sockets_allocated_dec(sk);
1996 }
1997 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1998 
1999 #ifdef CONFIG_PROC_FS
2000 /* Proc filesystem TCP sock list dumping. */
2001 
2002 /*
2003  * Get next listener socket follow cur.  If cur is NULL, get first socket
2004  * starting from bucket given in st->bucket; when st->bucket is zero the
2005  * very first socket in the hash table is returned.
2006  */
2007 static void *listening_get_next(struct seq_file *seq, void *cur)
2008 {
2009 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2010 	struct tcp_iter_state *st = seq->private;
2011 	struct net *net = seq_file_net(seq);
2012 	struct inet_listen_hashbucket *ilb;
2013 	struct sock *sk = cur;
2014 
2015 	if (!sk) {
2016 get_head:
2017 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2018 		spin_lock(&ilb->lock);
2019 		sk = sk_head(&ilb->head);
2020 		st->offset = 0;
2021 		goto get_sk;
2022 	}
2023 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2024 	++st->num;
2025 	++st->offset;
2026 
2027 	sk = sk_next(sk);
2028 get_sk:
2029 	sk_for_each_from(sk) {
2030 		if (!net_eq(sock_net(sk), net))
2031 			continue;
2032 		if (sk->sk_family == afinfo->family)
2033 			return sk;
2034 	}
2035 	spin_unlock(&ilb->lock);
2036 	st->offset = 0;
2037 	if (++st->bucket < INET_LHTABLE_SIZE)
2038 		goto get_head;
2039 	return NULL;
2040 }
2041 
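/* Skip *pos listening sockets and return the one that follows.  *pos is
 * decremented for every socket consumed, so on a NULL return it tells
 * the caller (tcp_get_idx) how many entries still have to be skipped in
 * the established hash.
 */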
2042 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2043 {
2044 	struct tcp_iter_state *st = seq->private;
2045 	void *rc;
2046 
2047 	st->bucket = 0;
2048 	st->offset = 0;
2049 	rc = listening_get_next(seq, NULL);
2050 
2051 	while (rc && *pos) {
2052 		rc = listening_get_next(seq, rc);
2053 		--*pos;
2054 	}
2055 	return rc;
2056 }
2057 
2058 static inline bool empty_bucket(const struct tcp_iter_state *st)
2059 {
2060 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2061 }
2062 
2063 /*
2064  * Get the first established socket, starting from the bucket given in st->bucket.
2065  * If st->bucket is zero, the very first socket in the hash is returned.
2066  */
2067 static void *established_get_first(struct seq_file *seq)
2068 {
2069 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2070 	struct tcp_iter_state *st = seq->private;
2071 	struct net *net = seq_file_net(seq);
2072 	void *rc = NULL;
2073 
2074 	st->offset = 0;
2075 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2076 		struct sock *sk;
2077 		struct hlist_nulls_node *node;
2078 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2079 
2080 		/* Lockless fast path for the common case of empty buckets */
2081 		if (empty_bucket(st))
2082 			continue;
2083 
2084 		spin_lock_bh(lock);
2085 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2086 			if (sk->sk_family != afinfo->family ||
2087 			    !net_eq(sock_net(sk), net)) {
2088 				continue;
2089 			}
2090 			rc = sk;
2091 			goto out;
2092 		}
2093 		spin_unlock_bh(lock);
2094 	}
2095 out:
2096 	return rc;
2097 }
2098 
2099 static void *established_get_next(struct seq_file *seq, void *cur)
2100 {
2101 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2102 	struct sock *sk = cur;
2103 	struct hlist_nulls_node *node;
2104 	struct tcp_iter_state *st = seq->private;
2105 	struct net *net = seq_file_net(seq);
2106 
2107 	++st->num;
2108 	++st->offset;
2109 
2110 	sk = sk_nulls_next(sk);
2111 
2112 	sk_nulls_for_each_from(sk, node) {
2113 		if (sk->sk_family == afinfo->family &&
2114 		    net_eq(sock_net(sk), net))
2115 			return sk;
2116 	}
2117 
2118 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2119 	++st->bucket;
2120 	return established_get_first(seq);
2121 }
2122 
2123 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2124 {
2125 	struct tcp_iter_state *st = seq->private;
2126 	void *rc;
2127 
2128 	st->bucket = 0;
2129 	rc = established_get_first(seq);
2130 
2131 	while (rc && pos) {
2132 		rc = established_get_next(seq, rc);
2133 		--pos;
2134 	}
2135 	return rc;
2136 }
2137 
2138 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2139 {
2140 	void *rc;
2141 	struct tcp_iter_state *st = seq->private;
2142 
2143 	st->state = TCP_SEQ_STATE_LISTENING;
2144 	rc	  = listening_get_idx(seq, &pos);
2145 
2146 	if (!rc) {
2147 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2148 		rc	  = established_get_idx(seq, pos);
2149 	}
2150 
2151 	return rc;
2152 }
2153 
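/* Try to resume the walk at the bucket and in-bucket offset remembered
 * from the previous read, so that consecutive reads of a large
 * /proc/net/tcp dump do not rescan the hash tables from the start.
 */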
2154 static void *tcp_seek_last_pos(struct seq_file *seq)
2155 {
2156 	struct tcp_iter_state *st = seq->private;
2157 	int offset = st->offset;
2158 	int orig_num = st->num;
2159 	void *rc = NULL;
2160 
2161 	switch (st->state) {
2162 	case TCP_SEQ_STATE_LISTENING:
2163 		if (st->bucket >= INET_LHTABLE_SIZE)
2164 			break;
2165 		st->state = TCP_SEQ_STATE_LISTENING;
2166 		rc = listening_get_next(seq, NULL);
2167 		while (offset-- && rc)
2168 			rc = listening_get_next(seq, rc);
2169 		if (rc)
2170 			break;
2171 		st->bucket = 0;
2172 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2173 		/* Fallthrough */
2174 	case TCP_SEQ_STATE_ESTABLISHED:
2175 		if (st->bucket > tcp_hashinfo.ehash_mask)
2176 			break;
2177 		rc = established_get_first(seq);
2178 		while (offset-- && rc)
2179 			rc = established_get_next(seq, rc);
2180 	}
2181 
2182 	st->num = orig_num;
2183 
2184 	return rc;
2185 }
2186 
2187 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2188 {
2189 	struct tcp_iter_state *st = seq->private;
2190 	void *rc;
2191 
2192 	if (*pos && *pos == st->last_pos) {
2193 		rc = tcp_seek_last_pos(seq);
2194 		if (rc)
2195 			goto out;
2196 	}
2197 
2198 	st->state = TCP_SEQ_STATE_LISTENING;
2199 	st->num = 0;
2200 	st->bucket = 0;
2201 	st->offset = 0;
2202 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2203 
2204 out:
2205 	st->last_pos = *pos;
2206 	return rc;
2207 }
2208 EXPORT_SYMBOL(tcp_seq_start);
2209 
2210 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2211 {
2212 	struct tcp_iter_state *st = seq->private;
2213 	void *rc = NULL;
2214 
2215 	if (v == SEQ_START_TOKEN) {
2216 		rc = tcp_get_idx(seq, 0);
2217 		goto out;
2218 	}
2219 
2220 	switch (st->state) {
2221 	case TCP_SEQ_STATE_LISTENING:
2222 		rc = listening_get_next(seq, v);
2223 		if (!rc) {
2224 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2225 			st->bucket = 0;
2226 			st->offset = 0;
2227 			rc	  = established_get_first(seq);
2228 		}
2229 		break;
2230 	case TCP_SEQ_STATE_ESTABLISHED:
2231 		rc = established_get_next(seq, v);
2232 		break;
2233 	}
2234 out:
2235 	++*pos;
2236 	st->last_pos = *pos;
2237 	return rc;
2238 }
2239 EXPORT_SYMBOL(tcp_seq_next);
2240 
2241 void tcp_seq_stop(struct seq_file *seq, void *v)
2242 {
2243 	struct tcp_iter_state *st = seq->private;
2244 
2245 	switch (st->state) {
2246 	case TCP_SEQ_STATE_LISTENING:
2247 		if (v != SEQ_START_TOKEN)
2248 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2249 		break;
2250 	case TCP_SEQ_STATE_ESTABLISHED:
2251 		if (v)
2252 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2253 		break;
2254 	}
2255 }
2256 EXPORT_SYMBOL(tcp_seq_stop);
2257 
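/* Format one SYN_RECV request socket as a single /proc/net/tcp line. */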
2258 static void get_openreq4(const struct request_sock *req,
2259 			 struct seq_file *f, int i)
2260 {
2261 	const struct inet_request_sock *ireq = inet_rsk(req);
2262 	long delta = req->rsk_timer.expires - jiffies;
2263 
2264 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2265 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2266 		i,
2267 		ireq->ir_loc_addr,
2268 		ireq->ir_num,
2269 		ireq->ir_rmt_addr,
2270 		ntohs(ireq->ir_rmt_port),
2271 		TCP_SYN_RECV,
2272 		0, 0, /* could print option size, but that is af dependent. */
2273 		1,    /* timers active (only the expire timer) */
2274 		jiffies_delta_to_clock_t(delta),
2275 		req->num_timeout,
2276 		from_kuid_munged(seq_user_ns(f),
2277 				 sock_i_uid(req->rsk_listener)),
2278 		0,  /* non-standard timer */
2279 		0, /* open_requests have no inode */
2280 		0,
2281 		req);
2282 }
2283 
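/* Format one full socket as a /proc/net/tcp line.  An illustrative
 * (not captured) ESTABLISHED entry looks roughly like:
 *
 *   0: 0100007F:0016 0100007F:B94C 01 00000000:00000000 00:00000000 00000000  1000 0 12345 1 0000000000000000 20 4 30 10 -1
 *
 * where 0100007F:0016 is 127.0.0.1:22 on a little-endian machine (the
 * 32-bit address is printed as raw hex), 01 is TCP_ESTABLISHED, and the
 * trailing fields are rto, ato, quickack/pingpong state, snd_cwnd and
 * snd_ssthresh (-1 while still in initial slow start).
 */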
2284 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2285 {
2286 	int timer_active;
2287 	unsigned long timer_expires;
2288 	const struct tcp_sock *tp = tcp_sk(sk);
2289 	const struct inet_connection_sock *icsk = inet_csk(sk);
2290 	const struct inet_sock *inet = inet_sk(sk);
2291 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2292 	__be32 dest = inet->inet_daddr;
2293 	__be32 src = inet->inet_rcv_saddr;
2294 	__u16 destp = ntohs(inet->inet_dport);
2295 	__u16 srcp = ntohs(inet->inet_sport);
2296 	int rx_queue;
2297 	int state;
2298 
2299 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2300 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2301 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2302 		timer_active	= 1;
2303 		timer_expires	= icsk->icsk_timeout;
2304 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2305 		timer_active	= 4;
2306 		timer_expires	= icsk->icsk_timeout;
2307 	} else if (timer_pending(&sk->sk_timer)) {
2308 		timer_active	= 2;
2309 		timer_expires	= sk->sk_timer.expires;
2310 	} else {
2311 		timer_active	= 0;
2312 		timer_expires = jiffies;
2313 	}
2314 
2315 	state = inet_sk_state_load(sk);
2316 	if (state == TCP_LISTEN)
2317 		rx_queue = sk->sk_ack_backlog;
2318 	else
2319 		/* Because we don't lock the socket,
2320 		 * we might find a transient negative value.
2321 		 */
2322 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2323 
2324 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2325 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2326 		i, src, srcp, dest, destp, state,
2327 		tp->write_seq - tp->snd_una,
2328 		rx_queue,
2329 		timer_active,
2330 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2331 		icsk->icsk_retransmits,
2332 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2333 		icsk->icsk_probes_out,
2334 		sock_i_ino(sk),
2335 		refcount_read(&sk->sk_refcnt), sk,
2336 		jiffies_to_clock_t(icsk->icsk_rto),
2337 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2338 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2339 		tp->snd_cwnd,
2340 		state == TCP_LISTEN ?
2341 		    fastopenq->max_qlen :
2342 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2343 }
2344 
2345 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2346 			       struct seq_file *f, int i)
2347 {
2348 	long delta = tw->tw_timer.expires - jiffies;
2349 	__be32 dest, src;
2350 	__u16 destp, srcp;
2351 
2352 	dest  = tw->tw_daddr;
2353 	src   = tw->tw_rcv_saddr;
2354 	destp = ntohs(tw->tw_dport);
2355 	srcp  = ntohs(tw->tw_sport);
2356 
2357 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2358 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2359 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2360 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2361 		refcount_read(&tw->tw_refcnt), tw);
2362 }
2363 
2364 #define TMPSZ 150
2365 
2366 static int tcp4_seq_show(struct seq_file *seq, void *v)
2367 {
2368 	struct tcp_iter_state *st;
2369 	struct sock *sk = v;
2370 
2371 	seq_setwidth(seq, TMPSZ - 1);
2372 	if (v == SEQ_START_TOKEN) {
2373 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2374 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2375 			   "inode");
2376 		goto out;
2377 	}
2378 	st = seq->private;
2379 
2380 	if (sk->sk_state == TCP_TIME_WAIT)
2381 		get_timewait4_sock(v, seq, st->num);
2382 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2383 		get_openreq4(v, seq, st->num);
2384 	else
2385 		get_tcp4_sock(v, seq, st->num);
2386 out:
2387 	seq_pad(seq, '\n');
2388 	return 0;
2389 }
2390 
2391 static const struct seq_operations tcp4_seq_ops = {
2392 	.show		= tcp4_seq_show,
2393 	.start		= tcp_seq_start,
2394 	.next		= tcp_seq_next,
2395 	.stop		= tcp_seq_stop,
2396 };
2397 
2398 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2399 	.family		= AF_INET,
2400 };
2401 
2402 static int __net_init tcp4_proc_init_net(struct net *net)
2403 {
2404 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2405 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2406 		return -ENOMEM;
2407 	return 0;
2408 }
2409 
2410 static void __net_exit tcp4_proc_exit_net(struct net *net)
2411 {
2412 	remove_proc_entry("tcp", net->proc_net);
2413 }
2414 
2415 static struct pernet_operations tcp4_net_ops = {
2416 	.init = tcp4_proc_init_net,
2417 	.exit = tcp4_proc_exit_net,
2418 };
2419 
2420 int __init tcp4_proc_init(void)
2421 {
2422 	return register_pernet_subsys(&tcp4_net_ops);
2423 }
2424 
2425 void tcp4_proc_exit(void)
2426 {
2427 	unregister_pernet_subsys(&tcp4_net_ops);
2428 }
2429 #endif /* CONFIG_PROC_FS */
2430 
2431 struct proto tcp_prot = {
2432 	.name			= "TCP",
2433 	.owner			= THIS_MODULE,
2434 	.close			= tcp_close,
2435 	.pre_connect		= tcp_v4_pre_connect,
2436 	.connect		= tcp_v4_connect,
2437 	.disconnect		= tcp_disconnect,
2438 	.accept			= inet_csk_accept,
2439 	.ioctl			= tcp_ioctl,
2440 	.init			= tcp_v4_init_sock,
2441 	.destroy		= tcp_v4_destroy_sock,
2442 	.shutdown		= tcp_shutdown,
2443 	.setsockopt		= tcp_setsockopt,
2444 	.getsockopt		= tcp_getsockopt,
2445 	.keepalive		= tcp_set_keepalive,
2446 	.recvmsg		= tcp_recvmsg,
2447 	.sendmsg		= tcp_sendmsg,
2448 	.sendpage		= tcp_sendpage,
2449 	.backlog_rcv		= tcp_v4_do_rcv,
2450 	.release_cb		= tcp_release_cb,
2451 	.hash			= inet_hash,
2452 	.unhash			= inet_unhash,
2453 	.get_port		= inet_csk_get_port,
2454 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2455 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2456 	.stream_memory_free	= tcp_stream_memory_free,
2457 	.sockets_allocated	= &tcp_sockets_allocated,
2458 	.orphan_count		= &tcp_orphan_count,
2459 	.memory_allocated	= &tcp_memory_allocated,
2460 	.memory_pressure	= &tcp_memory_pressure,
2461 	.sysctl_mem		= sysctl_tcp_mem,
2462 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2463 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2464 	.max_header		= MAX_TCP_HEADER,
2465 	.obj_size		= sizeof(struct tcp_sock),
2466 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2467 	.twsk_prot		= &tcp_timewait_sock_ops,
2468 	.rsk_prot		= &tcp_request_sock_ops,
2469 	.h.hashinfo		= &tcp_hashinfo,
2470 	.no_autobind		= true,
2471 #ifdef CONFIG_COMPAT
2472 	.compat_setsockopt	= compat_tcp_setsockopt,
2473 	.compat_getsockopt	= compat_tcp_getsockopt,
2474 #endif
2475 	.diag_destroy		= tcp_abort,
2476 };
2477 EXPORT_SYMBOL(tcp_prot);
2478 
2479 static void __net_exit tcp_sk_exit(struct net *net)
2480 {
2481 	int cpu;
2482 
2483 	module_put(net->ipv4.tcp_congestion_control->owner);
2484 
2485 	for_each_possible_cpu(cpu)
2486 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2487 	free_percpu(net->ipv4.tcp_sk);
2488 }
2489 
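/* Per-namespace initialization: create one kernel control socket per
 * possible CPU (used, among other things, to send RSTs and TIME_WAIT
 * ACKs on behalf of connections that have no full socket) and seed the
 * namespace's TCP sysctl defaults.
 */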
2490 static int __net_init tcp_sk_init(struct net *net)
2491 {
2492 	int res, cpu, cnt;
2493 
2494 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2495 	if (!net->ipv4.tcp_sk)
2496 		return -ENOMEM;
2497 
2498 	for_each_possible_cpu(cpu) {
2499 		struct sock *sk;
2500 
2501 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2502 					   IPPROTO_TCP, net);
2503 		if (res)
2504 			goto fail;
2505 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2506 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2507 	}
2508 
2509 	net->ipv4.sysctl_tcp_ecn = 2;
2510 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2511 
2512 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2513 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2514 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2515 
2516 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2517 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2518 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2519 
2520 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2521 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2522 	net->ipv4.sysctl_tcp_syncookies = 1;
2523 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2524 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2525 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2526 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2527 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2528 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2529 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2530 
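	/* Scale the TIME_WAIT bucket and SYN backlog defaults to the size
	 * of the established hash table.
	 */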
2531 	cnt = tcp_hashinfo.ehash_mask + 1;
2532 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2533 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2534 
2535 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2536 	net->ipv4.sysctl_tcp_sack = 1;
2537 	net->ipv4.sysctl_tcp_window_scaling = 1;
2538 	net->ipv4.sysctl_tcp_timestamps = 1;
2539 	net->ipv4.sysctl_tcp_early_retrans = 3;
2540 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2541 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2542 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2543 	net->ipv4.sysctl_tcp_max_reordering = 300;
2544 	net->ipv4.sysctl_tcp_dsack = 1;
2545 	net->ipv4.sysctl_tcp_app_win = 31;
2546 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2547 	net->ipv4.sysctl_tcp_frto = 2;
2548 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2549 	/* This limits the percentage of the congestion window which we
2550 	 * will allow a single TSO frame to consume.  Building TSO frames
2551 	 * which are too large can cause TCP streams to be bursty.
2552 	 */
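	/* e.g. with the default divisor of 3 and a congestion window of
	 * 60 segments, a single TSO burst is limited to roughly 20 segments.
	 */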
2553 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2554 	/* Default TSQ limit of four TSO segments */
2555 	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2556 	/* RFC 5961 challenge ACK rate limiting */
2557 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2558 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2559 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2560 	net->ipv4.sysctl_tcp_autocorking = 1;
2561 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2562 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2563 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2564 	if (net != &init_net) {
2565 		memcpy(net->ipv4.sysctl_tcp_rmem,
2566 		       init_net.ipv4.sysctl_tcp_rmem,
2567 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2568 		memcpy(net->ipv4.sysctl_tcp_wmem,
2569 		       init_net.ipv4.sysctl_tcp_wmem,
2570 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2571 	}
2572 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2573 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2574 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2575 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2576 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2577 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2578 
2579 	/* Reno is always built in */
2580 	if (!net_eq(net, &init_net) &&
2581 	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2582 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2583 	else
2584 		net->ipv4.tcp_congestion_control = &tcp_reno;
2585 
2586 	return 0;
2587 fail:
2588 	tcp_sk_exit(net);
2589 
2590 	return res;
2591 }
2592 
2593 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2594 {
2595 	struct net *net;
2596 
2597 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2598 
2599 	list_for_each_entry(net, net_exit_list, exit_list)
2600 		tcp_fastopen_ctx_destroy(net);
2601 }
2602 
2603 static struct pernet_operations __net_initdata tcp_sk_ops = {
2604        .init	   = tcp_sk_init,
2605        .exit	   = tcp_sk_exit,
2606        .exit_batch = tcp_sk_exit_batch,
2607 };
2608 
2609 void __init tcp_v4_init(void)
2610 {
2611 	if (register_pernet_subsys(&tcp_sk_ops))
2612 		panic("Failed to create the TCP control socket.\n");
2613 }
2614