xref: /linux/net/ipv4/tcp_ipv4.c (revision 9d106c6dd81bb26ad7fc3ee89cb1d62557c8e2c9)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
/* Forward declaration: tcp_v4_md5_hash_hdr() is static and is called by the
 * RST/ACK reply builders further down, ahead of its definition later in this
 * file. Only declared when CONFIG_TCP_MD5SIG is enabled.
 */
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
/* Hash-table state handed to the __inet_lookup_established() and
 * __inet_lookup_listener() calls in this file; exported to modules.
 */
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 				loopback = true;
128 		} else
129 #endif
130 		{
131 			if (ipv4_is_loopback(tw->tw_daddr) ||
132 			    ipv4_is_loopback(tw->tw_rcv_saddr))
133 				loopback = true;
134 		}
135 		if (!loopback)
136 			reuse = 0;
137 	}
138 
139 	/* With PAWS, it is safe from the viewpoint
140 	   of data integrity. Even without PAWS it is safe provided sequence
141 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
142 
143 	   Actually, the idea is close to VJ's one, only timestamp cache is
144 	   held not per host, but per port pair and TW bucket is used as state
145 	   holder.
146 
147 	   If TW bucket has been already destroyed we fall back to VJ's scheme
148 	   and use initial timestamp retrieved from peer table.
149 	 */
150 	if (tcptw->tw_ts_recent_stamp &&
151 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
152 					    tcptw->tw_ts_recent_stamp)))) {
153 		/* In case of repair and re-using TIME-WAIT sockets we still
154 		 * want to be sure that it is safe as above but honor the
155 		 * sequence numbers and time stamps set as part of the repair
156 		 * process.
157 		 *
158 		 * Without this check re-using a TIME-WAIT socket with TCP
159 		 * repair would accumulate a -1 on the repair assigned
160 		 * sequence number. The first time it is reused the sequence
161 		 * is -1, the second time -2, etc. This fixes that issue
162 		 * without appearing to create any others.
163 		 */
164 		if (likely(!tp->repair)) {
165 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166 
167 			if (!seq)
168 				seq = 1;
169 			WRITE_ONCE(tp->write_seq, seq);
170 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 		}
173 		sock_hold(sktw);
174 		return 1;
175 	}
176 
177 	return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 			      int addr_len)
183 {
184 	/* This check is replicated from tcp_v4_connect() and intended to
185 	 * prevent BPF program called below from accessing bytes that are out
186 	 * of the bound specified by user in addr_len.
187 	 */
188 	if (addr_len < sizeof(struct sockaddr_in))
189 		return -EINVAL;
190 
191 	sock_owned_by_me(sk);
192 
193 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195 
196 /* This will initiate an outgoing connection.
 *
 * @sk:       socket being connected; the lockdep annotation on the inet_opt
 *            dereference below expects the socket lock to be held.
 * @uaddr:    destination, interpreted as a struct sockaddr_in.
 * @addr_len: caller-supplied length of @uaddr.
 *
 * Returns 0 on success. Fails with -EINVAL (address too short, or a
 * source-route option is set together with a zero destination),
 * -EAFNOSUPPORT (sin_family is not AF_INET), -ENETUNREACH (the route is
 * multicast or broadcast), or whatever error the route lookup, port hashing,
 * Fast Open or tcp_connect() step reports.
 *
 * Once the socket has been moved to SYN-SENT, error paths go through the
 * "failure" label, which puts the socket back to TCP_CLOSE and clears
 * sk_route_caps and inet_dport. The one exception is a deferred Fast Open
 * connect, which returns directly.
 */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 	struct inet_sock *inet = inet_sk(sk);
201 	struct tcp_sock *tp = tcp_sk(sk);
202 	__be16 orig_sport, orig_dport;
203 	__be32 daddr, nexthop;
204 	struct flowi4 *fl4;
205 	struct rtable *rt;
206 	int err;
207 	struct ip_options_rcu *inet_opt;
208 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209 
210 	if (addr_len < sizeof(struct sockaddr_in))
211 		return -EINVAL;
212 
213 	if (usin->sin_family != AF_INET)
214 		return -EAFNOSUPPORT;
215 
	/* With a source-route option set, the route lookup targets the
	 * option's faddr instead of the requested destination.
	 */
216 	nexthop = daddr = usin->sin_addr.s_addr;
217 	inet_opt = rcu_dereference_protected(inet->inet_opt,
218 					     lockdep_sock_is_held(sk));
219 	if (inet_opt && inet_opt->opt.srr) {
220 		if (!daddr)
221 			return -EINVAL;
222 		nexthop = inet_opt->opt.faddr;
223 	}
224 
225 	orig_sport = inet->inet_sport;
226 	orig_dport = usin->sin_port;
	/* The flow key lives in the socket's cork area and is reused by
	 * ip_route_newports() below.
	 */
227 	fl4 = &inet->cork.fl.u.ip4;
228 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 			      IPPROTO_TCP,
231 			      orig_sport, orig_dport, sk);
232 	if (IS_ERR(rt)) {
233 		err = PTR_ERR(rt);
234 		if (err == -ENETUNREACH)
235 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 		return err;
237 	}
238 
239 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 		ip_rt_put(rt);
241 		return -ENETUNREACH;
242 	}
243 
	/* Without source routing, use the destination held in the flow key
	 * after the route lookup.
	 */
244 	if (!inet_opt || !inet_opt->opt.srr)
245 		daddr = fl4->daddr;
246 
	/* No source address chosen yet: adopt the one from the flow key. */
247 	if (!inet->inet_saddr)
248 		inet->inet_saddr = fl4->saddr;
249 	sk_rcv_saddr_set(sk, inet->inet_saddr);
250 
251 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 		/* Reset inherited state */
253 		tp->rx_opt.ts_recent	   = 0;
254 		tp->rx_opt.ts_recent_stamp = 0;
255 		if (likely(!tp->repair))
256 			WRITE_ONCE(tp->write_seq, 0);
257 	}
258 
259 	inet->inet_dport = usin->sin_port;
260 	sk_daddr_set(sk, daddr);
261 
262 	inet_csk(sk)->icsk_ext_hdr_len = 0;
263 	if (inet_opt)
264 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265 
266 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267 
268 	/* Socket identity is still unknown (sport may be zero).
269 	 * However we set state to SYN-SENT and not releasing socket
270 	 * lock select source port, enter ourselves into the hash tables and
271 	 * complete initialization after this.
272 	 */
273 	tcp_set_state(sk, TCP_SYN_SENT);
274 	err = inet_hash_connect(tcp_death_row, sk);
275 	if (err)
276 		goto failure;
277 
278 	sk_set_txhash(sk);
279 
	/* Redo the route with the ports now stored in the socket
	 * (inet_sport may differ from orig_sport after hashing).
	 */
280 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 			       inet->inet_sport, inet->inet_dport, sk);
282 	if (IS_ERR(rt)) {
283 		err = PTR_ERR(rt);
		/* rt is an error pointer here; clear it so the failure path's
		 * ip_rt_put() is not handed an error pointer.
		 */
284 		rt = NULL;
285 		goto failure;
286 	}
287 	/* OK, now commit destination to socket.  */
288 	sk->sk_gso_type = SKB_GSO_TCPV4;
289 	sk_setup_caps(sk, &rt->dst);
	/* NOTE(review): presumably sk_setup_caps() hands the route reference
	 * to the socket; clearing rt keeps the failure path from dropping it
	 * - confirm against sk_setup_caps().
	 */
290 	rt = NULL;
291 
	/* A write_seq of zero means no sequence number has been chosen yet
	 * (tcp_twsk_unique() or repair may have set one already).
	 */
292 	if (likely(!tp->repair)) {
293 		if (!tp->write_seq)
294 			WRITE_ONCE(tp->write_seq,
295 				   secure_tcp_seq(inet->inet_saddr,
296 						  inet->inet_daddr,
297 						  inet->inet_sport,
298 						  usin->sin_port));
299 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 						 inet->inet_saddr,
301 						 inet->inet_daddr);
302 	}
303 
304 	inet->inet_id = prandom_u32();
305 
	/* A deferred Fast Open connect returns here without calling
	 * tcp_connect() and without running the failure cleanup.
	 */
306 	if (tcp_fastopen_defer_connect(sk, &err))
307 		return err;
308 	if (err)
309 		goto failure;
310 
311 	err = tcp_connect(sk);
312 
313 	if (err)
314 		goto failure;
315 
316 	return 0;
317 
318 failure:
319 	/*
320 	 * This unhashes the socket and releases the local port,
321 	 * if necessary.
322 	 */
323 	tcp_set_state(sk, TCP_CLOSE);
324 	ip_rt_put(rt);
325 	sk->sk_route_caps = 0;
326 	inet->inet_dport = 0;
327 	return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330 
331 /*
332  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333  * It can be called through tcp_release_cb() if socket was owned by user
334  * at the time tcp_v4_err() was called to handle ICMP message.
335  */
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct dst_entry *dst;
340 	u32 mtu;
341 
342 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 		return;
344 	mtu = tcp_sk(sk)->mtu_info;
345 	dst = inet_csk_update_pmtu(sk, mtu);
346 	if (!dst)
347 		return;
348 
349 	/* Something is about to be wrong... Remember soft error
350 	 * for the case, if this connection will not able to recover.
351 	 */
352 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 		sk->sk_err_soft = EMSGSIZE;
354 
355 	mtu = dst_mtu(dst);
356 
357 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 	    ip_sk_accept_pmtu(sk) &&
359 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 		tcp_sync_mss(sk, mtu);
361 
362 		/* Resend the TCP packet because it's
363 		 * clear that the old packet has been
364 		 * dropped. This is the new "fast" path mtu
365 		 * discovery.
366 		 */
367 		tcp_simple_retransmit(sk);
368 	} /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 	struct dst_entry *dst = __sk_dst_check(sk, 0);
375 
376 	if (dst)
377 		dst->ops->redirect(dst, sk, skb);
378 }
379 
380 
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 	struct request_sock *req = inet_reqsk(sk);
385 	struct net *net = sock_net(sk);
386 
387 	/* ICMPs are not backlogged, hence we cannot get
388 	 * an established socket here.
389 	 */
390 	if (seq != tcp_rsk(req)->snt_isn) {
391 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 	} else if (abort) {
393 		/*
394 		 * Still in SYN_RECV, just remove it silently.
395 		 * There is no good way to pass the error to the newly
396 		 * created socket, and POSIX does not want network
397 		 * errors returned from accept().
398 		 */
399 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 		tcp_listendrop(req->rsk_listener);
401 	}
402 	reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405 
406 /*
407  * This routine is called by the ICMP module when it gets some
408  * sort of error condition.  If err < 0 then the socket should
409  * be closed and the error returned to the user.  If err > 0
410  * it's just the icmp type << 8 | icmp code.  After adjustment
411  * header points to the first 8 bytes of the tcp header.  We need
412  * to find the appropriate port.
413  *
414  * The locking strategy used here is very "optimistic". When
415  * someone else accesses the socket the ICMP is just dropped
416  * and for some paths there is no check at all.
417  * A more general error queue to queue errors for later handling
418  * is probably better.
419  *
 * @icmp_skb: the ICMP packet; its data pointer is read as the IP header of
 *            the quoted packet, followed (after ihl words) by the quoted
 *            TCP header.
 * @info:     stored into tp->mtu_info for ICMP_FRAG_NEEDED; not used for
 *            any other message type in this function.
 *
 * Returns -ENOENT when no matching socket is found, 0 in every other case.
 *
420  */
421 
422 int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
423 {
424 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
425 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
426 	struct inet_connection_sock *icsk;
427 	struct tcp_sock *tp;
428 	struct inet_sock *inet;
429 	const int type = icmp_hdr(icmp_skb)->type;
430 	const int code = icmp_hdr(icmp_skb)->code;
431 	struct sock *sk;
432 	struct sk_buff *skb;
433 	struct request_sock *fastopen;
434 	u32 seq, snd_una;
435 	s32 remaining;
436 	u32 delta_us;
437 	int err;
438 	struct net *net = dev_net(icmp_skb->dev);
439 
	/* Find the socket from the addresses and ports of the quoted
	 * headers.
	 */
440 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
441 				       th->dest, iph->saddr, ntohs(th->source),
442 				       inet_iif(icmp_skb), 0);
443 	if (!sk) {
444 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
445 		return -ENOENT;
446 	}
	/* TIME-WAIT sockets ignore the message; just drop the reference. */
447 	if (sk->sk_state == TCP_TIME_WAIT) {
448 		inet_twsk_put(inet_twsk(sk));
449 		return 0;
450 	}
451 	seq = ntohl(th->seq);
	/* Request sockets are handled by tcp_req_err(), which also releases
	 * the reference. The third argument asks for the request to be
	 * dropped on parameter problem, time exceeded and net/host
	 * unreachable.
	 */
452 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
453 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
454 				     type == ICMP_TIME_EXCEEDED ||
455 				     (type == ICMP_DEST_UNREACH &&
456 				      (code == ICMP_NET_UNREACH ||
457 				       code == ICMP_HOST_UNREACH)));
458 		return 0;
459 	}
460 
461 	bh_lock_sock(sk);
462 	/* If too many ICMPs get dropped on busy
463 	 * servers this needs to be solved differently.
464 	 * We do take care of PMTU discovery (RFC1191) special case :
465 	 * we can receive locally generated ICMP messages while socket is held.
466 	 */
	/* This only updates the counter; processing continues and each case
	 * below re-checks sock_owned_by_user() before touching the socket.
	 */
467 	if (sock_owned_by_user(sk)) {
468 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
469 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
470 	}
471 	if (sk->sk_state == TCP_CLOSE)
472 		goto out;
473 
474 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
475 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
476 		goto out;
477 	}
478 
479 	icsk = inet_csk(sk);
480 	tp = tcp_sk(sk);
481 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
482 	fastopen = rcu_dereference(tp->fastopen_rsk);
483 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except for listeners, the quoted sequence number must fall
	 * between snd_una and snd_nxt.
	 */
484 	if (sk->sk_state != TCP_LISTEN &&
485 	    !between(seq, snd_una, tp->snd_nxt)) {
486 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
487 		goto out;
488 	}
489 
	/* First switch: map the ICMP type/code to an errno in err, or finish
	 * early via "goto out" for messages that need no error reporting.
	 */
490 	switch (type) {
491 	case ICMP_REDIRECT:
492 		if (!sock_owned_by_user(sk))
493 			do_redirect(icmp_skb, sk);
494 		goto out;
495 	case ICMP_SOURCE_QUENCH:
496 		/* Just silently ignore these. */
497 		goto out;
498 	case ICMP_PARAMETERPROB:
499 		err = EPROTO;
500 		break;
501 	case ICMP_DEST_UNREACH:
502 		if (code > NR_ICMP_UNREACH)
503 			goto out;
504 
505 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
506 			/* We are not interested in TCP_LISTEN and open_requests
507 			 * (SYN-ACKs send out by Linux are always <576bytes so
508 			 * they should go through unfragmented).
509 			 */
510 			if (sk->sk_state == TCP_LISTEN)
511 				goto out;
512 
513 			tp->mtu_info = info;
			/* Apply the new MTU now if possible. Otherwise set
			 * the deferred flag, taking a socket reference only
			 * when the flag was not already set.
			 */
514 			if (!sock_owned_by_user(sk)) {
515 				tcp_v4_mtu_reduced(sk);
516 			} else {
517 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
518 					sock_hold(sk);
519 			}
520 			goto out;
521 		}
522 
523 		err = icmp_err_convert[code].errno;
524 		/* check if icmp_skb allows revert of backoff
525 		 * (see draft-zimmermann-tcp-lcd) */
526 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
527 			break;
528 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
529 		    !icsk->icsk_backoff || fastopen)
530 			break;
531 
532 		if (sock_owned_by_user(sk))
533 			break;
534 
535 		skb = tcp_rtx_queue_head(sk);
536 		if (WARN_ON_ONCE(!skb))
537 			break;
538 
		/* Undo one backoff step and recompute the RTO from it. */
539 		icsk->icsk_backoff--;
540 		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
541 					       TCP_TIMEOUT_INIT;
542 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
543 
544 
		/* remaining = new RTO minus the time elapsed since the
		 * timestamp of the retransmit-queue head.
		 */
545 		tcp_mstamp_refresh(tp);
546 		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
547 		remaining = icsk->icsk_rto -
548 			    usecs_to_jiffies(delta_us);
549 
550 		if (remaining > 0) {
551 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
552 						  remaining, TCP_RTO_MAX);
553 		} else {
554 			/* RTO revert clocked out retransmission.
555 			 * Will retransmit now */
556 			tcp_retransmit_timer(sk);
557 		}
558 
559 		break;
560 	case ICMP_TIME_EXCEEDED:
561 		err = EHOSTUNREACH;
562 		break;
563 	default:
564 		goto out;
565 	}
566 
	/* Second switch: while still opening, an error is fatal unless the
	 * user owns the socket, in which case it is only kept as a soft
	 * error.
	 */
567 	switch (sk->sk_state) {
568 	case TCP_SYN_SENT:
569 	case TCP_SYN_RECV:
570 		/* Only in fast or simultaneous open. If a fast open socket
571 		 * is already accepted it is treated as a connected one below.
572 		 */
573 		if (fastopen && !fastopen->sk)
574 			break;
575 
576 		if (!sock_owned_by_user(sk)) {
577 			sk->sk_err = err;
578 
579 			sk->sk_error_report(sk);
580 
581 			tcp_done(sk);
582 		} else {
583 			sk->sk_err_soft = err;
584 		}
585 		goto out;
586 	}
587 
588 	/* If we've already connected we will keep trying
589 	 * until we time out, or the user gives up.
590 	 *
591 	 * rfc1122 4.2.3.9 allows to consider as hard errors
592 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
593 	 * but it is obsoleted by pmtu discovery).
594 	 *
595 	 * Note, that in modern internet, where routing is unreliable
596 	 * and in each dark corner broken firewalls sit, sending random
597 	 * errors ordered by their masters even this two messages finally lose
598 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
599 	 *
600 	 * Now we are in compliance with RFCs.
601 	 *							--ANK (980905)
602 	 */
603 
604 	inet = inet_sk(sk);
605 	if (!sock_owned_by_user(sk) && inet->recverr) {
606 		sk->sk_err = err;
607 		sk->sk_error_report(sk);
608 	} else	{ /* Only an error on timeout */
609 		sk->sk_err_soft = err;
610 	}
611 
	/* Common exit for full sockets: undo bh_lock_sock() and drop the
	 * reference obtained by the lookup.
	 */
612 out:
613 	bh_unlock_sock(sk);
614 	sock_put(sk);
615 	return 0;
616 }
617 
618 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
619 {
620 	struct tcphdr *th = tcp_hdr(skb);
621 
622 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
623 	skb->csum_start = skb_transport_header(skb) - skb->head;
624 	skb->csum_offset = offsetof(struct tcphdr, check);
625 }
626 
627 /* This routine computes an IPv4 TCP checksum. */
628 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
629 {
630 	const struct inet_sock *inet = inet_sk(sk);
631 
632 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
633 }
634 EXPORT_SYMBOL(tcp_v4_send_check);
635 
636 /*
637  *	This routine will send an RST to the other tcp.
638  *
639  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
640  *		      for reset.
641  *	Answer: if a packet caused RST, it is not for a socket
642  *		existing in our system, if it is matched to a socket,
643  *		it is just duplicate segment or bug in other side's TCP.
644  *		So that we build reply only basing on parameters
645  *		arrived with segment.
646  *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk:  socket the segment was matched to, or NULL. It may be a full
 *	      socket or a TIME-WAIT socket (both are handled below).
 *	@skb: the segment being answered.
 *
 *	Nothing is sent when @skb itself carries RST, when @sk is NULL and
 *	the input route is not local, or (CONFIG_TCP_MD5SIG only) when a
 *	signed segment for a non-full socket cannot be verified.
647  */
648 
649 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
650 {
651 	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: a bare TCP header, plus room for one MD5 option. */
652 	struct {
653 		struct tcphdr th;
654 #ifdef CONFIG_TCP_MD5SIG
655 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
656 #endif
657 	} rep;
658 	struct ip_reply_arg arg;
659 #ifdef CONFIG_TCP_MD5SIG
660 	struct tcp_md5sig_key *key = NULL;
661 	const __u8 *hash_location = NULL;
662 	unsigned char newhash[16];
663 	int genhash;
664 	struct sock *sk1 = NULL;
665 #endif
666 	u64 transmit_time = 0;
667 	struct sock *ctl_sk;
668 	struct net *net;
669 
670 	/* Never send a reset in response to a reset. */
671 	if (th->rst)
672 		return;
673 
674 	/* If sk not NULL, it means we did a successful lookup and incoming
675 	 * route had to be correct. prequeue might have dropped our dst.
676 	 */
677 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
678 		return;
679 
680 	/* Swap the send and the receive. */
681 	memset(&rep, 0, sizeof(rep));
682 	rep.th.dest   = th->source;
683 	rep.th.source = th->dest;
684 	rep.th.doff   = sizeof(struct tcphdr) / 4;
685 	rep.th.rst    = 1;
686 
	/* If the segment carried an ACK, the RST takes its sequence number
	 * from that ack_seq. Otherwise the RST acknowledges the segment:
	 * its seq plus SYN, FIN and payload length.
	 */
687 	if (th->ack) {
688 		rep.th.seq = th->ack_seq;
689 	} else {
690 		rep.th.ack = 1;
691 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
692 				       skb->len - (th->doff << 2));
693 	}
694 
695 	memset(&arg, 0, sizeof(arg));
696 	arg.iov[0].iov_base = (unsigned char *)&rep;
697 	arg.iov[0].iov_len  = sizeof(rep.th);
698 
699 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
700 #ifdef CONFIG_TCP_MD5SIG
	/* The RCU read section opened here is closed at the "out" label. */
701 	rcu_read_lock();
702 	hash_location = tcp_parse_md5sig_option(th);
703 	if (sk && sk_fullsock(sk)) {
704 		const union tcp_md5_addr *addr;
705 		int l3index;
706 
707 		/* sdif set, means packet ingressed via a device
708 		 * in an L3 domain and inet_iif is set to it.
709 		 */
710 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
711 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
712 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
713 	} else if (hash_location) {
714 		const union tcp_md5_addr *addr;
715 		int sdif = tcp_v4_sdif(skb);
716 		int dif = inet_iif(skb);
717 		int l3index;
718 
719 		/*
720 		 * active side is lost. Try to find listening socket through
721 		 * source port, and then find md5 key through listening socket.
722 		 * We do not lose security here:
723 		 * Incoming packet is checked with md5 hash with finding key,
724 		 * no RST generated if md5 hash doesn't match.
725 		 */
726 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
727 					     ip_hdr(skb)->saddr,
728 					     th->source, ip_hdr(skb)->daddr,
729 					     ntohs(th->source), dif, sdif);
730 		/* don't send rst if it can't find key */
731 		if (!sk1)
732 			goto out;
733 
734 		/* sdif set, means packet ingressed via a device
735 		 * in an L3 domain and dif is set to it.
736 		 */
737 		l3index = sdif ? dif : 0;
738 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
739 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
740 		if (!key)
741 			goto out;
742 
743 
		/* Recompute the hash over the incoming segment and compare
		 * it with the 16 bytes found in its MD5 option.
		 */
744 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
745 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
746 			goto out;
747 
748 	}
749 
	/* Sign the reply: option header word in opt[0], hash from opt[1]. */
750 	if (key) {
751 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
752 				   (TCPOPT_NOP << 16) |
753 				   (TCPOPT_MD5SIG << 8) |
754 				   TCPOLEN_MD5SIG);
755 		/* Update length and the length the header thinks exists */
756 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
757 		rep.th.doff = arg.iov[0].iov_len / 4;
758 
759 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
760 				     key, ip_hdr(skb)->saddr,
761 				     ip_hdr(skb)->daddr, &rep.th);
762 	}
763 #endif
764 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
765 				      ip_hdr(skb)->saddr, /* XXX */
766 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
767 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
768 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
769 
770 	/* When socket is gone, all binding information is lost.
771 	 * routing might fail in this case. No choice here, if we choose to force
772 	 * input interface, we will misroute in case of asymmetric route.
773 	 */
774 	if (sk) {
775 		arg.bound_dev_if = sk->sk_bound_dev_if;
776 		if (sk_fullsock(sk))
777 			trace_tcp_send_reset(sk, skb);
778 	}
779 
	/* sk->sk_bound_dev_if above is also read from TIME-WAIT sockets;
	 * this build-time check verifies that both structures keep the
	 * field at the same offset.
	 */
780 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
781 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
782 
783 	arg.tos = ip_hdr(skb)->tos;
784 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* The reply goes out through this CPU's control socket for the
	 * namespace, with bottom halves disabled while it is in use.
	 */
785 	local_bh_disable();
786 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
787 	if (sk) {
788 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
789 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
790 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
791 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
792 		transmit_time = tcp_transmit_time(sk);
793 	}
794 	ip_send_unicast_reply(ctl_sk,
795 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
796 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
797 			      &arg, arg.iov[0].iov_len,
798 			      transmit_time);
799 
	/* NOTE(review): only sk_mark is reset; sk_priority keeps the value
	 * set above - confirm this is intended.
	 */
800 	ctl_sk->sk_mark = 0;
801 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
802 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
803 	local_bh_enable();
804 
805 #ifdef CONFIG_TCP_MD5SIG
806 out:
807 	rcu_read_unlock();
808 #endif
809 }
810 
811 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
812    outside socket context is ugly, certainly. What can I do?
813  */
814 
/* Build and send a bare ACK in reply to @skb through this CPU's control
 * socket.
 *
 * @sk:          socket the reply is sent on behalf of; it may be a TIME-WAIT
 *               socket (mark and priority are then read from the timewait
 *               fields).
 * @skb:         segment being answered; ports and addresses are taken from it
 *               and swapped.
 * @seq, @ack:   values for the reply's seq and ack_seq fields (host order).
 * @win:         value for the window field (host order).
 * @tsval, @tsecr: timestamp option values; the option is only emitted when
 *               @tsecr is non-zero.
 * @oif:         when non-zero, used as the reply's bound_dev_if.
 * @key:         when non-NULL and CONFIG_TCP_MD5SIG is enabled, an MD5
 *               signature option is appended.
 * @reply_flags: copied into arg.flags.
 * @tos:         copied into arg.tos.
 */
815 static void tcp_v4_send_ack(const struct sock *sk,
816 			    struct sk_buff *skb, u32 seq, u32 ack,
817 			    u32 win, u32 tsval, u32 tsecr, int oif,
818 			    struct tcp_md5sig_key *key,
819 			    int reply_flags, u8 tos)
820 {
821 	const struct tcphdr *th = tcp_hdr(skb);
822 	struct {
823 		struct tcphdr th;
824 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
825 #ifdef CONFIG_TCP_MD5SIG
826 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
827 #endif
828 			];
829 	} rep;
830 	struct net *net = sock_net(sk);
831 	struct ip_reply_arg arg;
832 	struct sock *ctl_sk;
833 	u64 transmit_time;
834 
	/* Only the header is zeroed: option words are written individually
	 * below, and iov_len is grown only over the words actually written.
	 */
835 	memset(&rep.th, 0, sizeof(struct tcphdr));
836 	memset(&arg, 0, sizeof(arg));
837 
838 	arg.iov[0].iov_base = (unsigned char *)&rep;
839 	arg.iov[0].iov_len  = sizeof(rep.th);
840 	if (tsecr) {
841 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
842 				   (TCPOPT_TIMESTAMP << 8) |
843 				   TCPOLEN_TIMESTAMP);
844 		rep.opt[1] = htonl(tsval);
845 		rep.opt[2] = htonl(tsecr);
846 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
847 	}
848 
849 	/* Swap the send and the receive. */
850 	rep.th.dest    = th->source;
851 	rep.th.source  = th->dest;
852 	rep.th.doff    = arg.iov[0].iov_len / 4;
853 	rep.th.seq     = htonl(seq);
854 	rep.th.ack_seq = htonl(ack);
855 	rep.th.ack     = 1;
856 	rep.th.window  = htons(win);
857 
858 #ifdef CONFIG_TCP_MD5SIG
859 	if (key) {
		/* The MD5 option starts after the three timestamp words,
		 * when those were written.
		 */
860 		int offset = (tsecr) ? 3 : 0;
861 
862 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
863 					  (TCPOPT_NOP << 16) |
864 					  (TCPOPT_MD5SIG << 8) |
865 					  TCPOLEN_MD5SIG);
866 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
867 		rep.th.doff = arg.iov[0].iov_len/4;
868 
869 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
870 				    key, ip_hdr(skb)->saddr,
871 				    ip_hdr(skb)->daddr, &rep.th);
872 	}
873 #endif
874 	arg.flags = reply_flags;
875 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 				      ip_hdr(skb)->saddr, /* XXX */
877 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 	if (oif)
880 		arg.bound_dev_if = oif;
881 	arg.tos = tos;
882 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* The control socket is per CPU: bottom halves stay disabled while
	 * it is borrowed.
	 */
883 	local_bh_disable();
884 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
885 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
886 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
887 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
888 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
889 	transmit_time = tcp_transmit_time(sk);
890 	ip_send_unicast_reply(ctl_sk,
891 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
892 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
893 			      &arg, arg.iov[0].iov_len,
894 			      transmit_time);
895 
	/* NOTE(review): only sk_mark is reset; sk_priority keeps the value
	 * set above - confirm this is intended.
	 */
896 	ctl_sk->sk_mark = 0;
897 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
898 	local_bh_enable();
899 }
900 
901 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
902 {
903 	struct inet_timewait_sock *tw = inet_twsk(sk);
904 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
905 
906 	tcp_v4_send_ack(sk, skb,
907 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
908 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
909 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
910 			tcptw->tw_ts_recent,
911 			tw->tw_bound_dev_if,
912 			tcp_twsk_md5_key(tcptw),
913 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
914 			tw->tw_tos
915 			);
916 
917 	inet_twsk_put(tw);
918 }
919 
920 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
921 				  struct request_sock *req)
922 {
923 	const union tcp_md5_addr *addr;
924 	int l3index;
925 
926 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
927 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
928 	 */
929 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
930 					     tcp_sk(sk)->snd_nxt;
931 
932 	/* RFC 7323 2.3
933 	 * The window field (SEG.WND) of every outgoing segment, with the
934 	 * exception of <SYN> segments, MUST be right-shifted by
935 	 * Rcv.Wind.Shift bits:
936 	 */
937 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
938 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
939 	tcp_v4_send_ack(sk, skb, seq,
940 			tcp_rsk(req)->rcv_nxt,
941 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
942 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
943 			req->ts_recent,
944 			0,
945 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
946 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
947 			ip_hdr(skb)->tos);
948 }
949 
950 /*
951  *	Send a SYN-ACK after having received a SYN.
952  *	This still operates on a request_sock only, not on a big
953  *	socket.
954  */
955 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
956 			      struct flowi *fl,
957 			      struct request_sock *req,
958 			      struct tcp_fastopen_cookie *foc,
959 			      enum tcp_synack_type synack_type)
960 {
961 	const struct inet_request_sock *ireq = inet_rsk(req);
962 	struct flowi4 fl4;
963 	int err = -1;
964 	struct sk_buff *skb;
965 
966 	/* First, grab a route. */
967 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
968 		return -1;
969 
970 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
971 
972 	if (skb) {
973 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
974 
975 		rcu_read_lock();
976 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
977 					    ireq->ir_rmt_addr,
978 					    rcu_dereference(ireq->ireq_opt));
979 		rcu_read_unlock();
980 		err = net_xmit_eval(err);
981 	}
982 
983 	return err;
984 }
985 
986 /*
987  *	IPv4 request_sock destructor.
988  */
989 static void tcp_v4_reqsk_destructor(struct request_sock *req)
990 {
991 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
992 }
993 
994 #ifdef CONFIG_TCP_MD5SIG
995 /*
996  * RFC2385 MD5 checksumming requires a mapping of
997  * IP address->MD5 Key.
998  * We need to maintain these in the sk structure.
999  */
1000 
/* Static key for TCP MD5 support, defined in its default-false state and
 * exported.  NOTE(review): the place where it gets enabled is not in this
 * part of the file.
 */
1001 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1002 EXPORT_SYMBOL(tcp_md5_needed);
1003 
1004 /* Find the Key structure for an address.  */
1005 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1006 					   const union tcp_md5_addr *addr,
1007 					   int family)
1008 {
1009 	const struct tcp_sock *tp = tcp_sk(sk);
1010 	struct tcp_md5sig_key *key;
1011 	const struct tcp_md5sig_info *md5sig;
1012 	__be32 mask;
1013 	struct tcp_md5sig_key *best_match = NULL;
1014 	bool match;
1015 
1016 	/* caller either holds rcu_read_lock() or socket lock */
1017 	md5sig = rcu_dereference_check(tp->md5sig_info,
1018 				       lockdep_sock_is_held(sk));
1019 	if (!md5sig)
1020 		return NULL;
1021 
1022 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1023 				 lockdep_sock_is_held(sk)) {
1024 		if (key->family != family)
1025 			continue;
1026 		if (key->l3index && key->l3index != l3index)
1027 			continue;
1028 		if (family == AF_INET) {
1029 			mask = inet_make_mask(key->prefixlen);
1030 			match = (key->addr.a4.s_addr & mask) ==
1031 				(addr->a4.s_addr & mask);
1032 #if IS_ENABLED(CONFIG_IPV6)
1033 		} else if (family == AF_INET6) {
1034 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1035 						  key->prefixlen);
1036 #endif
1037 		} else {
1038 			match = false;
1039 		}
1040 
1041 		if (match && (!best_match ||
1042 			      key->prefixlen > best_match->prefixlen))
1043 			best_match = key;
1044 	}
1045 	return best_match;
1046 }
1047 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1048 
1049 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1050 						      const union tcp_md5_addr *addr,
1051 						      int family, u8 prefixlen,
1052 						      int l3index)
1053 {
1054 	const struct tcp_sock *tp = tcp_sk(sk);
1055 	struct tcp_md5sig_key *key;
1056 	unsigned int size = sizeof(struct in_addr);
1057 	const struct tcp_md5sig_info *md5sig;
1058 
1059 	/* caller either holds rcu_read_lock() or socket lock */
1060 	md5sig = rcu_dereference_check(tp->md5sig_info,
1061 				       lockdep_sock_is_held(sk));
1062 	if (!md5sig)
1063 		return NULL;
1064 #if IS_ENABLED(CONFIG_IPV6)
1065 	if (family == AF_INET6)
1066 		size = sizeof(struct in6_addr);
1067 #endif
1068 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1069 				 lockdep_sock_is_held(sk)) {
1070 		if (key->family != family)
1071 			continue;
1072 		if (key->l3index && key->l3index != l3index)
1073 			continue;
1074 		if (!memcmp(&key->addr, addr, size) &&
1075 		    key->prefixlen == prefixlen)
1076 			return key;
1077 	}
1078 	return NULL;
1079 }
1080 
1081 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1082 					 const struct sock *addr_sk)
1083 {
1084 	const union tcp_md5_addr *addr;
1085 	int l3index;
1086 
1087 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1088 						 addr_sk->sk_bound_dev_if);
1089 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1090 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1091 }
1092 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1093 
1094 /* This can be called on a newly created socket, from other files */
1095 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1096 		   int family, u8 prefixlen, int l3index,
1097 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1098 {
1099 	/* Add Key to the list */
1100 	struct tcp_md5sig_key *key;
1101 	struct tcp_sock *tp = tcp_sk(sk);
1102 	struct tcp_md5sig_info *md5sig;
1103 
1104 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1105 	if (key) {
1106 		/* Pre-existing entry - just update that one. */
1107 		memcpy(key->key, newkey, newkeylen);
1108 		key->keylen = newkeylen;
1109 		return 0;
1110 	}
1111 
1112 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1113 					   lockdep_sock_is_held(sk));
1114 	if (!md5sig) {
1115 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1116 		if (!md5sig)
1117 			return -ENOMEM;
1118 
1119 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1120 		INIT_HLIST_HEAD(&md5sig->head);
1121 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1122 	}
1123 
1124 	key = sock_kmalloc(sk, sizeof(*key), gfp);
1125 	if (!key)
1126 		return -ENOMEM;
1127 	if (!tcp_alloc_md5sig_pool()) {
1128 		sock_kfree_s(sk, key, sizeof(*key));
1129 		return -ENOMEM;
1130 	}
1131 
1132 	memcpy(key->key, newkey, newkeylen);
1133 	key->keylen = newkeylen;
1134 	key->family = family;
1135 	key->prefixlen = prefixlen;
1136 	key->l3index = l3index;
1137 	memcpy(&key->addr, addr,
1138 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1139 				      sizeof(struct in_addr));
1140 	hlist_add_head_rcu(&key->node, &md5sig->head);
1141 	return 0;
1142 }
1143 EXPORT_SYMBOL(tcp_md5_do_add);
1144 
1145 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1146 		   u8 prefixlen, int l3index)
1147 {
1148 	struct tcp_md5sig_key *key;
1149 
1150 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1151 	if (!key)
1152 		return -ENOENT;
1153 	hlist_del_rcu(&key->node);
1154 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1155 	kfree_rcu(key, rcu);
1156 	return 0;
1157 }
1158 EXPORT_SYMBOL(tcp_md5_do_del);
1159 
1160 static void tcp_clear_md5_list(struct sock *sk)
1161 {
1162 	struct tcp_sock *tp = tcp_sk(sk);
1163 	struct tcp_md5sig_key *key;
1164 	struct hlist_node *n;
1165 	struct tcp_md5sig_info *md5sig;
1166 
1167 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1168 
1169 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1170 		hlist_del_rcu(&key->node);
1171 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1172 		kfree_rcu(key, rcu);
1173 	}
1174 }
1175 
1176 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1177 				 char __user *optval, int optlen)
1178 {
1179 	struct tcp_md5sig cmd;
1180 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1181 	const union tcp_md5_addr *addr;
1182 	u8 prefixlen = 32;
1183 	int l3index = 0;
1184 
1185 	if (optlen < sizeof(cmd))
1186 		return -EINVAL;
1187 
1188 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1189 		return -EFAULT;
1190 
1191 	if (sin->sin_family != AF_INET)
1192 		return -EINVAL;
1193 
1194 	if (optname == TCP_MD5SIG_EXT &&
1195 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1196 		prefixlen = cmd.tcpm_prefixlen;
1197 		if (prefixlen > 32)
1198 			return -EINVAL;
1199 	}
1200 
1201 	if (optname == TCP_MD5SIG_EXT &&
1202 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1203 		struct net_device *dev;
1204 
1205 		rcu_read_lock();
1206 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1207 		if (dev && netif_is_l3_master(dev))
1208 			l3index = dev->ifindex;
1209 
1210 		rcu_read_unlock();
1211 
1212 		/* ok to reference set/not set outside of rcu;
1213 		 * right now device MUST be an L3 master
1214 		 */
1215 		if (!dev || !l3index)
1216 			return -EINVAL;
1217 	}
1218 
1219 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1220 
1221 	if (!cmd.tcpm_keylen)
1222 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1223 
1224 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1225 		return -EINVAL;
1226 
1227 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1228 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1229 }
1230 
1231 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1232 				   __be32 daddr, __be32 saddr,
1233 				   const struct tcphdr *th, int nbytes)
1234 {
1235 	struct tcp4_pseudohdr *bp;
1236 	struct scatterlist sg;
1237 	struct tcphdr *_th;
1238 
1239 	bp = hp->scratch;
1240 	bp->saddr = saddr;
1241 	bp->daddr = daddr;
1242 	bp->pad = 0;
1243 	bp->protocol = IPPROTO_TCP;
1244 	bp->len = cpu_to_be16(nbytes);
1245 
1246 	_th = (struct tcphdr *)(bp + 1);
1247 	memcpy(_th, th, sizeof(*th));
1248 	_th->check = 0;
1249 
1250 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1251 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1252 				sizeof(*bp) + sizeof(*th));
1253 	return crypto_ahash_update(hp->md5_req);
1254 }
1255 
1256 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1257 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1258 {
1259 	struct tcp_md5sig_pool *hp;
1260 	struct ahash_request *req;
1261 
1262 	hp = tcp_get_md5sig_pool();
1263 	if (!hp)
1264 		goto clear_hash_noput;
1265 	req = hp->md5_req;
1266 
1267 	if (crypto_ahash_init(req))
1268 		goto clear_hash;
1269 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1270 		goto clear_hash;
1271 	if (tcp_md5_hash_key(hp, key))
1272 		goto clear_hash;
1273 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1274 	if (crypto_ahash_final(req))
1275 		goto clear_hash;
1276 
1277 	tcp_put_md5sig_pool();
1278 	return 0;
1279 
1280 clear_hash:
1281 	tcp_put_md5sig_pool();
1282 clear_hash_noput:
1283 	memset(md5_hash, 0, 16);
1284 	return 1;
1285 }
1286 
1287 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1288 			const struct sock *sk,
1289 			const struct sk_buff *skb)
1290 {
1291 	struct tcp_md5sig_pool *hp;
1292 	struct ahash_request *req;
1293 	const struct tcphdr *th = tcp_hdr(skb);
1294 	__be32 saddr, daddr;
1295 
1296 	if (sk) { /* valid for establish/request sockets */
1297 		saddr = sk->sk_rcv_saddr;
1298 		daddr = sk->sk_daddr;
1299 	} else {
1300 		const struct iphdr *iph = ip_hdr(skb);
1301 		saddr = iph->saddr;
1302 		daddr = iph->daddr;
1303 	}
1304 
1305 	hp = tcp_get_md5sig_pool();
1306 	if (!hp)
1307 		goto clear_hash_noput;
1308 	req = hp->md5_req;
1309 
1310 	if (crypto_ahash_init(req))
1311 		goto clear_hash;
1312 
1313 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1314 		goto clear_hash;
1315 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1316 		goto clear_hash;
1317 	if (tcp_md5_hash_key(hp, key))
1318 		goto clear_hash;
1319 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1320 	if (crypto_ahash_final(req))
1321 		goto clear_hash;
1322 
1323 	tcp_put_md5sig_pool();
1324 	return 0;
1325 
1326 clear_hash:
1327 	tcp_put_md5sig_pool();
1328 clear_hash_noput:
1329 	memset(md5_hash, 0, 16);
1330 	return 1;
1331 }
1332 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1333 
1334 #endif
1335 
1336 /* Called with rcu_read_lock() */
1337 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1338 				    const struct sk_buff *skb,
1339 				    int dif, int sdif)
1340 {
1341 #ifdef CONFIG_TCP_MD5SIG
1342 	/*
1343 	 * This gets called for each TCP segment that arrives
1344 	 * so we want to be efficient.
1345 	 * We have 3 drop cases:
1346 	 * o No MD5 hash and one expected.
1347 	 * o MD5 hash and we're not expecting one.
1348 	 * o MD5 hash and its wrong.
1349 	 */
1350 	const __u8 *hash_location = NULL;
1351 	struct tcp_md5sig_key *hash_expected;
1352 	const struct iphdr *iph = ip_hdr(skb);
1353 	const struct tcphdr *th = tcp_hdr(skb);
1354 	const union tcp_md5_addr *addr;
1355 	unsigned char newhash[16];
1356 	int genhash, l3index;
1357 
1358 	/* sdif set, means packet ingressed via a device
1359 	 * in an L3 domain and dif is set to the l3mdev
1360 	 */
1361 	l3index = sdif ? dif : 0;
1362 
1363 	addr = (union tcp_md5_addr *)&iph->saddr;
1364 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1365 	hash_location = tcp_parse_md5sig_option(th);
1366 
1367 	/* We've parsed the options - do we have a hash? */
1368 	if (!hash_expected && !hash_location)
1369 		return false;
1370 
1371 	if (hash_expected && !hash_location) {
1372 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1373 		return true;
1374 	}
1375 
1376 	if (!hash_expected && hash_location) {
1377 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1378 		return true;
1379 	}
1380 
1381 	/* Okay, so this is hash_expected and hash_location -
1382 	 * so we need to calculate the checksum.
1383 	 */
1384 	genhash = tcp_v4_md5_hash_skb(newhash,
1385 				      hash_expected,
1386 				      NULL, skb);
1387 
1388 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1389 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1390 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1391 				     &iph->saddr, ntohs(th->source),
1392 				     &iph->daddr, ntohs(th->dest),
1393 				     genhash ? " tcp_v4_calc_md5_hash failed"
1394 				     : "", l3index);
1395 		return true;
1396 	}
1397 	return false;
1398 #endif
1399 	return false;
1400 }
1401 
1402 static void tcp_v4_init_req(struct request_sock *req,
1403 			    const struct sock *sk_listener,
1404 			    struct sk_buff *skb)
1405 {
1406 	struct inet_request_sock *ireq = inet_rsk(req);
1407 	struct net *net = sock_net(sk_listener);
1408 
1409 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1410 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1411 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1412 }
1413 
1414 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1415 					  struct flowi *fl,
1416 					  const struct request_sock *req)
1417 {
1418 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1419 }
1420 
/* Generic request_sock operations for IPv4 TCP; passed to
 * tcp_conn_request() and tcp_get_syncookie_mss() in this file.
 */
1421 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1422 	.family		=	PF_INET,
1423 	.obj_size	=	sizeof(struct tcp_request_sock),
1424 	.rtx_syn_ack	=	tcp_rtx_synack,
1425 	.send_ack	=	tcp_v4_reqsk_send_ack,
1426 	.destructor	=	tcp_v4_reqsk_destructor,
1427 	.send_reset	=	tcp_v4_send_reset,
1428 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1429 };
1430 
/* TCP specific, address family dependent request_sock operations for IPv4.
 * The MD5 and syncookie hooks are only present when the corresponding
 * config option is enabled.
 */
1431 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1432 	.mss_clamp	=	TCP_MSS_DEFAULT,
1433 #ifdef CONFIG_TCP_MD5SIG
1434 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1435 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1436 #endif
1437 	.init_req	=	tcp_v4_init_req,
1438 #ifdef CONFIG_SYN_COOKIES
1439 	.cookie_init_seq =	cookie_v4_init_sequence,
1440 #endif
1441 	.route_req	=	tcp_v4_route_req,
1442 	.init_seq	=	tcp_v4_init_seq,
1443 	.init_ts_off	=	tcp_v4_init_ts_off,
1444 	.send_synack	=	tcp_v4_send_synack,
1445 };
1446 
1447 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1448 {
1449 	/* Never answer to SYNs send to broadcast or multicast */
1450 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1451 		goto drop;
1452 
1453 	return tcp_conn_request(&tcp_request_sock_ops,
1454 				&tcp_request_sock_ipv4_ops, sk, skb);
1455 
1456 drop:
1457 	tcp_listendrop(sk);
1458 	return 0;
1459 }
1460 EXPORT_SYMBOL(tcp_v4_conn_request);
1461 
1462 
1463 /*
1464  * The three way handshake has completed - we got a valid ACK -
1465  * now create the new socket.
 *
 * The child is created with tcp_create_openreq_child(), then its
 * addresses, bound device and IP options are taken from @req, route,
 * capabilities and MSS are set up, a matching MD5 key of the listener
 * (if any) is copied, and the child is handed to inet_ehash_nolisten().
 *
 * @dst:     route to use; when NULL one is looked up with
 *           inet_csk_route_child_sock()
 * @own_req: set to the return value of inet_ehash_nolisten()
 *
 * Returns the new socket, or NULL after accounting a listen drop when
 * the accept queue is full, the child cannot be allocated, no route is
 * found or __inet_inherit_port() fails.
1466  */
1467 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1468 				  struct request_sock *req,
1469 				  struct dst_entry *dst,
1470 				  struct request_sock *req_unhash,
1471 				  bool *own_req)
1472 {
1473 	struct inet_request_sock *ireq;
1474 	struct inet_sock *newinet;
1475 	struct tcp_sock *newtp;
1476 	struct sock *newsk;
1477 #ifdef CONFIG_TCP_MD5SIG
1478 	const union tcp_md5_addr *addr;
1479 	struct tcp_md5sig_key *key;
1480 	int l3index;
1481 #endif
1482 	struct ip_options_rcu *inet_opt;
1483 
1484 	if (sk_acceptq_is_full(sk))
1485 		goto exit_overflow;
1486 
1487 	newsk = tcp_create_openreq_child(sk, req, skb);
1488 	if (!newsk)
1489 		goto exit_nonewsk;
1490 
1491 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1492 	inet_sk_rx_dst_set(newsk, skb);
1493 
1494 	newtp		      = tcp_sk(newsk);
1495 	newinet		      = inet_sk(newsk);
1496 	ireq		      = inet_rsk(req);
1497 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1498 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1499 	newsk->sk_bound_dev_if = ireq->ir_iif;
1500 	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's option block for now; which of the
	 * two keeps it is settled after inet_ehash_nolisten() below.
	 */
1501 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1502 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1503 	newinet->mc_index     = inet_iif(skb);
1504 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1505 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1506 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1507 	if (inet_opt)
1508 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1509 	newinet->inet_id = prandom_u32();
1510 
1511 	if (!dst) {
1512 		dst = inet_csk_route_child_sock(sk, newsk, req);
1513 		if (!dst)
1514 			goto put_and_exit;
1515 	} else {
1516 		/* syncookie case : see end of cookie_v4_check() */
1517 	}
1518 	sk_setup_caps(newsk, dst);
1519 
1520 	tcp_ca_openreq_child(newsk, dst);
1521 
1522 	tcp_sync_mss(newsk, dst_mtu(dst));
1523 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1524 
1525 	tcp_initialize_rcv_mss(newsk);
1526 
1527 #ifdef CONFIG_TCP_MD5SIG
1528 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1529 	/* Copy over the MD5 key from the original socket */
1530 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1531 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1532 	if (key) {
1533 		/*
1534 		 * We're using one, so create a matching key
1535 		 * on the newsk structure. If we fail to get
1536 		 * memory, then we end up not copying the key
1537 		 * across. Shucks.
1538 		 */
		/* The child always gets a full /32 key for its peer. */
1539 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1540 			       key->key, key->keylen, GFP_ATOMIC);
1541 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1542 	}
1543 #endif
1544 
1545 	if (__inet_inherit_port(sk, newsk) < 0)
1546 		goto put_and_exit;
1547 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	/* Exactly one side keeps the option block: the child when it owns the
	 * request, the request (to be freed by its destructor) otherwise.
	 */
1548 	if (likely(*own_req)) {
1549 		tcp_move_syn(newtp, req);
1550 		ireq->ireq_opt = NULL;
1551 	} else {
1552 		newinet->inet_opt = NULL;
1553 	}
1554 	return newsk;
1555 
1556 exit_overflow:
1557 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1558 exit_nonewsk:
1559 	dst_release(dst);
1560 exit:
1561 	tcp_listendrop(sk);
1562 	return NULL;
1563 put_and_exit:
	/* The options still belong to the request; detach them from the
	 * child before it is torn down.
	 */
1564 	newinet->inet_opt = NULL;
1565 	inet_csk_prepare_forced_close(newsk);
1566 	tcp_done(newsk);
1567 	goto exit;
1568 }
1569 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1570 
/* With CONFIG_SYN_COOKIES, pass a segment without the SYN flag to
 * cookie_v4_check() and return its result; in every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1581 
1582 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1583 			 struct tcphdr *th, u32 *cookie)
1584 {
1585 	u16 mss = 0;
1586 #ifdef CONFIG_SYN_COOKIES
1587 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1588 				    &tcp_request_sock_ipv4_ops, sk, th);
1589 	if (mss) {
1590 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1591 		tcp_synq_overflow(sk);
1592 	}
1593 #endif
1594 	return mss;
1595 }
1596 
1597 /* The socket must have it's spinlock held when we get
1598  * here, unless it is a TCP_LISTEN socket.
1599  *
1600  * We have a potential double-lock case here, so even when
1601  * doing backlog processing we use the BH locking scheme.
1602  * This is because we cannot sleep with the original spinlock
1603  * held.
1604  */
1605 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1606 {
1607 	struct sock *rsk;
1608 
1609 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1610 		struct dst_entry *dst = sk->sk_rx_dst;
1611 
1612 		sock_rps_save_rxhash(sk, skb);
1613 		sk_mark_napi_id(sk, skb);
1614 		if (dst) {
1615 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1616 			    !dst->ops->check(dst, 0)) {
1617 				dst_release(dst);
1618 				sk->sk_rx_dst = NULL;
1619 			}
1620 		}
1621 		tcp_rcv_established(sk, skb);
1622 		return 0;
1623 	}
1624 
1625 	if (tcp_checksum_complete(skb))
1626 		goto csum_err;
1627 
1628 	if (sk->sk_state == TCP_LISTEN) {
1629 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1630 
1631 		if (!nsk)
1632 			goto discard;
1633 		if (nsk != sk) {
1634 			if (tcp_child_process(sk, nsk, skb)) {
1635 				rsk = nsk;
1636 				goto reset;
1637 			}
1638 			return 0;
1639 		}
1640 	} else
1641 		sock_rps_save_rxhash(sk, skb);
1642 
1643 	if (tcp_rcv_state_process(sk, skb)) {
1644 		rsk = sk;
1645 		goto reset;
1646 	}
1647 	return 0;
1648 
1649 reset:
1650 	tcp_v4_send_reset(rsk, skb);
1651 discard:
1652 	kfree_skb(skb);
1653 	/* Be careful here. If this function gets more complicated and
1654 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1655 	 * might be destroyed here. This current version compiles correctly,
1656 	 * but you have been warned.
1657 	 */
1658 	return 0;
1659 
1660 csum_err:
1661 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1662 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1663 	goto discard;
1664 }
1665 EXPORT_SYMBOL(tcp_v4_do_rcv);
1666 
1667 int tcp_v4_early_demux(struct sk_buff *skb)
1668 {
1669 	const struct iphdr *iph;
1670 	const struct tcphdr *th;
1671 	struct sock *sk;
1672 
1673 	if (skb->pkt_type != PACKET_HOST)
1674 		return 0;
1675 
1676 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1677 		return 0;
1678 
1679 	iph = ip_hdr(skb);
1680 	th = tcp_hdr(skb);
1681 
1682 	if (th->doff < sizeof(struct tcphdr) / 4)
1683 		return 0;
1684 
1685 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1686 				       iph->saddr, th->source,
1687 				       iph->daddr, ntohs(th->dest),
1688 				       skb->skb_iif, inet_sdif(skb));
1689 	if (sk) {
1690 		skb->sk = sk;
1691 		skb->destructor = sock_edemux;
1692 		if (sk_fullsock(sk)) {
1693 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1694 
1695 			if (dst)
1696 				dst = dst_check(dst, 0);
1697 			if (dst &&
1698 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1699 				skb_dst_set_noref(skb, dst);
1700 		}
1701 	}
1702 	return 0;
1703 }
1704 
1705 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1706 {
1707 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1708 	struct skb_shared_info *shinfo;
1709 	const struct tcphdr *th;
1710 	struct tcphdr *thtail;
1711 	struct sk_buff *tail;
1712 	unsigned int hdrlen;
1713 	bool fragstolen;
1714 	u32 gso_segs;
1715 	int delta;
1716 
1717 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1718 	 * we can fix skb->truesize to its real value to avoid future drops.
1719 	 * This is valid because skb is not yet charged to the socket.
1720 	 * It has been noticed pure SACK packets were sometimes dropped
1721 	 * (if cooked by drivers without copybreak feature).
1722 	 */
1723 	skb_condense(skb);
1724 
1725 	skb_dst_drop(skb);
1726 
1727 	if (unlikely(tcp_checksum_complete(skb))) {
1728 		bh_unlock_sock(sk);
1729 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1730 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1731 		return true;
1732 	}
1733 
1734 	/* Attempt coalescing to last skb in backlog, even if we are
1735 	 * above the limits.
1736 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1737 	 */
1738 	th = (const struct tcphdr *)skb->data;
1739 	hdrlen = th->doff * 4;
1740 	shinfo = skb_shinfo(skb);
1741 
1742 	if (!shinfo->gso_size)
1743 		shinfo->gso_size = skb->len - hdrlen;
1744 
1745 	if (!shinfo->gso_segs)
1746 		shinfo->gso_segs = 1;
1747 
1748 	tail = sk->sk_backlog.tail;
1749 	if (!tail)
1750 		goto no_coalesce;
1751 	thtail = (struct tcphdr *)tail->data;
1752 
1753 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1754 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1755 	    ((TCP_SKB_CB(tail)->tcp_flags |
1756 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1757 	    !((TCP_SKB_CB(tail)->tcp_flags &
1758 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1759 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1760 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1761 #ifdef CONFIG_TLS_DEVICE
1762 	    tail->decrypted != skb->decrypted ||
1763 #endif
1764 	    thtail->doff != th->doff ||
1765 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1766 		goto no_coalesce;
1767 
1768 	__skb_pull(skb, hdrlen);
1769 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1770 		thtail->window = th->window;
1771 
1772 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1773 
1774 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1775 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1776 
1777 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1778 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1779 		 * is not entered if we append a packet with a FIN.
1780 		 * SYN, RST, URG are not present.
1781 		 * ACK is set on both packets.
1782 		 * PSH : we do not really care in TCP stack,
1783 		 *       at least for 'GRO' packets.
1784 		 */
1785 		thtail->fin |= th->fin;
1786 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1787 
1788 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1789 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1790 			tail->tstamp = skb->tstamp;
1791 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1792 		}
1793 
1794 		/* Not as strict as GRO. We only need to carry mss max value */
1795 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1796 						 skb_shinfo(tail)->gso_size);
1797 
1798 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1799 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1800 
1801 		sk->sk_backlog.len += delta;
1802 		__NET_INC_STATS(sock_net(sk),
1803 				LINUX_MIB_TCPBACKLOGCOALESCE);
1804 		kfree_skb_partial(skb, fragstolen);
1805 		return false;
1806 	}
1807 	__skb_push(skb, hdrlen);
1808 
1809 no_coalesce:
1810 	/* Only socket owner can try to collapse/prune rx queues
1811 	 * to reduce memory overhead, so add a little headroom here.
1812 	 * Few sockets backlog are possibly concurrently non empty.
1813 	 */
1814 	limit += 64*1024;
1815 
1816 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1817 		bh_unlock_sock(sk);
1818 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1819 		return true;
1820 	}
1821 	return false;
1822 }
1823 EXPORT_SYMBOL(tcp_add_backlog);
1824 
1825 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1826 {
1827 	struct tcphdr *th = (struct tcphdr *)skb->data;
1828 
1829 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1830 }
1831 EXPORT_SYMBOL(tcp_filter);
1832 
1833 static void tcp_v4_restore_cb(struct sk_buff *skb)
1834 {
1835 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1836 		sizeof(struct inet_skb_parm));
1837 }
1838 
1839 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1840 			   const struct tcphdr *th)
1841 {
1842 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1843 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1844 	 */
1845 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1846 		sizeof(struct inet_skb_parm));
1847 	barrier();
1848 
1849 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1850 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1851 				    skb->len - th->doff * 4);
1852 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1853 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1854 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1855 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1856 	TCP_SKB_CB(skb)->sacked	 = 0;
1857 	TCP_SKB_CB(skb)->has_rxtstamp =
1858 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1859 }
1860 
1861 /*
1862  *	From tcp_input.c
1863  */
1864 
/* Receive one IPv4 TCP segment.
 *
 * Validates the TCP header, looks up the owning socket and dispatches on
 * the socket state: TIME_WAIT and NEW_SYN_RECV (request) sockets have their
 * own handling, listeners are processed directly, and other sockets are
 * processed under the BH socket lock or queued on the backlog when a user
 * context owns the socket.
 *
 * Returns 0 on all discard paths and for TIME_WAIT handling, otherwise 0
 * or the value returned by tcp_v4_do_rcv().
 */
1865 int tcp_v4_rcv(struct sk_buff *skb)
1866 {
1867 	struct net *net = dev_net(skb->dev);
1868 	struct sk_buff *skb_to_free;
1869 	int sdif = inet_sdif(skb);
1870 	int dif = inet_iif(skb);
1871 	const struct iphdr *iph;
1872 	const struct tcphdr *th;
1873 	bool refcounted;
1874 	struct sock *sk;
1875 	int ret;
1876 
1877 	if (skb->pkt_type != PACKET_HOST)
1878 		goto discard_it;
1879 
1880 	/* Count it even if it's bad */
1881 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1882 
1883 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1884 		goto discard_it;
1885 
1886 	th = (const struct tcphdr *)skb->data;
1887 
	/* A data offset smaller than the fixed header is counted as an
	 * input error.
	 */
1888 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1889 		goto bad_packet;
1890 	if (!pskb_may_pull(skb, th->doff * 4))
1891 		goto discard_it;
1892 
1893 	/* An explanation is required here, I think.
1894 	 * Packet length and doff are validated by header prediction,
1895 	 * provided case of th->doff==0 is eliminated.
1896 	 * So, we defer the checks. */
1897 
1898 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1899 		goto csum_error;
1900 
	/* NOTE(review): th is re-read here, presumably because the pulls
	 * above can relocate the skb data - confirm.
	 */
1901 	th = (const struct tcphdr *)skb->data;
1902 	iph = ip_hdr(skb);
	/* Re-entered when a request's listener is no longer listening or the
	 * request was taken over by another cpu.
	 */
1903 lookup:
1904 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1905 			       th->dest, sdif, &refcounted);
1906 	if (!sk)
1907 		goto no_tcp_socket;
1908 
	/* Also entered from the TIME_WAIT handling with a listener in sk. */
1909 process:
1910 	if (sk->sk_state == TCP_TIME_WAIT)
1911 		goto do_time_wait;
1912 
1913 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1914 		struct request_sock *req = inet_reqsk(sk);
1915 		bool req_stolen = false;
1916 		struct sock *nsk;
1917 
		/* From here on sk is the listener the request belongs to. */
1918 		sk = req->rsk_listener;
1919 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1920 			sk_drops_add(sk, skb);
1921 			reqsk_put(req);
1922 			goto discard_it;
1923 		}
1924 		if (tcp_checksum_complete(skb)) {
1925 			reqsk_put(req);
1926 			goto csum_error;
1927 		}
		/* The listener is not listening any more: drop the request
		 * and redo the lookup.
		 */
1928 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1929 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1930 			goto lookup;
1931 		}
1932 		/* We own a reference on the listener, increase it again
1933 		 * as we might lose it too soon.
1934 		 */
1935 		sock_hold(sk);
1936 		refcounted = true;
1937 		nsk = NULL;
1938 		if (!tcp_filter(sk, skb)) {
1939 			th = (const struct tcphdr *)skb->data;
1940 			iph = ip_hdr(skb);
1941 			tcp_v4_fill_cb(skb, iph, th);
1942 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1943 		}
1944 		if (!nsk) {
1945 			reqsk_put(req);
1946 			if (req_stolen) {
1947 				/* Another cpu got exclusive access to req
1948 				 * and created a full blown socket.
1949 				 * Try to feed this packet to this socket
1950 				 * instead of discarding it.
1951 				 */
1952 				tcp_v4_restore_cb(skb);
1953 				sock_put(sk);
1954 				goto lookup;
1955 			}
1956 			goto discard_and_relse;
1957 		}
		/* nsk == sk: continue below with the listener itself; otherwise
		 * the segment is processed on the socket tcp_check_req() returned.
		 */
1958 		if (nsk == sk) {
1959 			reqsk_put(req);
1960 			tcp_v4_restore_cb(skb);
1961 		} else if (tcp_child_process(sk, nsk, skb)) {
1962 			tcp_v4_send_reset(nsk, skb);
1963 			goto discard_and_relse;
1964 		} else {
1965 			sock_put(sk);
1966 			return 0;
1967 		}
1968 	}
	/* Drop segments whose TTL is below the socket's configured minimum. */
1969 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1970 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1971 		goto discard_and_relse;
1972 	}
1973 
1974 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1975 		goto discard_and_relse;
1976 
1977 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1978 		goto discard_and_relse;
1979 
1980 	nf_reset_ct(skb);
1981 
1982 	if (tcp_filter(sk, skb))
1983 		goto discard_and_relse;
	/* Header pointers are re-read after the filter has run. */
1984 	th = (const struct tcphdr *)skb->data;
1985 	iph = ip_hdr(skb);
1986 	tcp_v4_fill_cb(skb, iph, th);
1987 
1988 	skb->dev = NULL;
1989 
	/* Listeners are processed without taking the socket lock. */
1990 	if (sk->sk_state == TCP_LISTEN) {
1991 		ret = tcp_v4_do_rcv(sk, skb);
1992 		goto put_and_return;
1993 	}
1994 
1995 	sk_incoming_cpu_update(sk);
1996 
1997 	bh_lock_sock_nested(sk);
1998 	tcp_segs_in(tcp_sk(sk), skb);
1999 	ret = 0;
	/* Process now if no user context owns the socket, else queue on the
	 * backlog.  When tcp_add_backlog() returns true it has already
	 * unlocked the socket.
	 */
2000 	if (!sock_owned_by_user(sk)) {
2001 		skb_to_free = sk->sk_rx_skb_cache;
2002 		sk->sk_rx_skb_cache = NULL;
2003 		ret = tcp_v4_do_rcv(sk, skb);
2004 	} else {
2005 		if (tcp_add_backlog(sk, skb))
2006 			goto discard_and_relse;
2007 		skb_to_free = NULL;
2008 	}
2009 	bh_unlock_sock(sk);
2010 	if (skb_to_free)
2011 		__kfree_skb(skb_to_free);
2012 
2013 put_and_return:
2014 	if (refcounted)
2015 		sock_put(sk);
2016 
2017 	return ret;
2018 
2019 no_tcp_socket:
2020 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2021 		goto discard_it;
2022 
2023 	tcp_v4_fill_cb(skb, iph, th);
2024 
	/* The csum_error and bad_packet labels sit inside this block so that
	 * those paths share the counters and skip sending a reset.
	 */
2025 	if (tcp_checksum_complete(skb)) {
2026 csum_error:
2027 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2028 bad_packet:
2029 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2030 	} else {
2031 		tcp_v4_send_reset(NULL, skb);
2032 	}
2033 
2034 discard_it:
2035 	/* Discard frame. */
2036 	kfree_skb(skb);
2037 	return 0;
2038 
2039 discard_and_relse:
2040 	sk_drops_add(sk, skb);
2041 	if (refcounted)
2042 		sock_put(sk);
2043 	goto discard_it;
2044 
2045 do_time_wait:
2046 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2047 		inet_twsk_put(inet_twsk(sk));
2048 		goto discard_it;
2049 	}
2050 
2051 	tcp_v4_fill_cb(skb, iph, th);
2052 
2053 	if (tcp_checksum_complete(skb)) {
2054 		inet_twsk_put(inet_twsk(sk));
2055 		goto csum_error;
2056 	}
2057 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2058 	case TCP_TW_SYN: {
		/* If a listener matches, the timewait socket is released and
		 * the segment is processed on the listener instead.
		 */
2059 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2060 							&tcp_hashinfo, skb,
2061 							__tcp_hdrlen(th),
2062 							iph->saddr, th->source,
2063 							iph->daddr, th->dest,
2064 							inet_iif(skb),
2065 							sdif);
2066 		if (sk2) {
2067 			inet_twsk_deschedule_put(inet_twsk(sk));
2068 			sk = sk2;
2069 			tcp_v4_restore_cb(skb);
2070 			refcounted = false;
2071 			goto process;
2072 		}
2073 	}
2074 		/* to ACK */
2075 		fallthrough;
2076 	case TCP_TW_ACK:
2077 		tcp_v4_timewait_ack(sk, skb);
2078 		break;
2079 	case TCP_TW_RST:
2080 		tcp_v4_send_reset(sk, skb);
2081 		inet_twsk_deschedule_put(inet_twsk(sk));
2082 		goto discard_it;
2083 	case TCP_TW_SUCCESS:;
2084 	}
2085 	goto discard_it;
2086 }
2087 
2088 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2089 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2090 	.twsk_unique	= tcp_twsk_unique,
2091 	.twsk_destructor= tcp_twsk_destructor,
2092 };
2093 
2094 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2095 {
2096 	struct dst_entry *dst = skb_dst(skb);
2097 
2098 	if (dst && dst_hold_safe(dst)) {
2099 		sk->sk_rx_dst = dst;
2100 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2101 	}
2102 }
2103 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2104 
2105 const struct inet_connection_sock_af_ops ipv4_specific = {
2106 	.queue_xmit	   = ip_queue_xmit,
2107 	.send_check	   = tcp_v4_send_check,
2108 	.rebuild_header	   = inet_sk_rebuild_header,
2109 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2110 	.conn_request	   = tcp_v4_conn_request,
2111 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2112 	.net_header_len	   = sizeof(struct iphdr),
2113 	.setsockopt	   = ip_setsockopt,
2114 	.getsockopt	   = ip_getsockopt,
2115 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2116 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2117 #ifdef CONFIG_COMPAT
2118 	.compat_setsockopt = compat_ip_setsockopt,
2119 	.compat_getsockopt = compat_ip_getsockopt,
2120 #endif
2121 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2122 };
2123 EXPORT_SYMBOL(ipv4_specific);
2124 
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 flavour of the TCP MD5 signature hooks; only built with
 * CONFIG_TCP_MD5SIG and installed by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_parse		= tcp_v4_parse_md5_keys,
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
};
#endif
2132 
2133 /* NOTE: A lot of things set to zero explicitly by call to
2134  *       sk_alloc() so need not be done here.
2135  */
2136 static int tcp_v4_init_sock(struct sock *sk)
2137 {
2138 	struct inet_connection_sock *icsk = inet_csk(sk);
2139 
2140 	tcp_init_sock(sk);
2141 
2142 	icsk->icsk_af_ops = &ipv4_specific;
2143 
2144 #ifdef CONFIG_TCP_MD5SIG
2145 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2146 #endif
2147 
2148 	return 0;
2149 }
2150 
2151 void tcp_v4_destroy_sock(struct sock *sk)
2152 {
2153 	struct tcp_sock *tp = tcp_sk(sk);
2154 
2155 	trace_tcp_destroy_sock(sk);
2156 
2157 	tcp_clear_xmit_timers(sk);
2158 
2159 	tcp_cleanup_congestion_control(sk);
2160 
2161 	tcp_cleanup_ulp(sk);
2162 
2163 	/* Cleanup up the write buffer. */
2164 	tcp_write_queue_purge(sk);
2165 
2166 	/* Check if we want to disable active TFO */
2167 	tcp_fastopen_active_disable_ofo_check(sk);
2168 
2169 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2170 	skb_rbtree_purge(&tp->out_of_order_queue);
2171 
2172 #ifdef CONFIG_TCP_MD5SIG
2173 	/* Clean up the MD5 key list, if any */
2174 	if (tp->md5sig_info) {
2175 		tcp_clear_md5_list(sk);
2176 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2177 		tp->md5sig_info = NULL;
2178 	}
2179 #endif
2180 
2181 	/* Clean up a referenced TCP bind bucket. */
2182 	if (inet_csk(sk)->icsk_bind_hash)
2183 		inet_put_port(sk);
2184 
2185 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2186 
2187 	/* If socket is aborted during connect operation */
2188 	tcp_free_fastopen_req(tp);
2189 	tcp_fastopen_destroy_cipher(sk);
2190 	tcp_saved_syn_free(tp);
2191 
2192 	sk_sockets_allocated_dec(sk);
2193 }
2194 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2195 
2196 #ifdef CONFIG_PROC_FS
2197 /* Proc filesystem TCP sock list dumping. */
2198 
2199 /*
2200  * Get next listener socket follow cur.  If cur is NULL, get first socket
2201  * starting from bucket given in st->bucket; when st->bucket is zero the
2202  * very first socket in the hash table is returned.
2203  */
2204 static void *listening_get_next(struct seq_file *seq, void *cur)
2205 {
2206 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2207 	struct tcp_iter_state *st = seq->private;
2208 	struct net *net = seq_file_net(seq);
2209 	struct inet_listen_hashbucket *ilb;
2210 	struct hlist_nulls_node *node;
2211 	struct sock *sk = cur;
2212 
2213 	if (!sk) {
2214 get_head:
2215 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2216 		spin_lock(&ilb->lock);
2217 		sk = sk_nulls_head(&ilb->nulls_head);
2218 		st->offset = 0;
2219 		goto get_sk;
2220 	}
2221 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2222 	++st->num;
2223 	++st->offset;
2224 
2225 	sk = sk_nulls_next(sk);
2226 get_sk:
2227 	sk_nulls_for_each_from(sk, node) {
2228 		if (!net_eq(sock_net(sk), net))
2229 			continue;
2230 		if (sk->sk_family == afinfo->family)
2231 			return sk;
2232 	}
2233 	spin_unlock(&ilb->lock);
2234 	st->offset = 0;
2235 	if (++st->bucket < INET_LHTABLE_SIZE)
2236 		goto get_head;
2237 	return NULL;
2238 }
2239 
2240 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2241 {
2242 	struct tcp_iter_state *st = seq->private;
2243 	void *rc;
2244 
2245 	st->bucket = 0;
2246 	st->offset = 0;
2247 	rc = listening_get_next(seq, NULL);
2248 
2249 	while (rc && *pos) {
2250 		rc = listening_get_next(seq, rc);
2251 		--*pos;
2252 	}
2253 	return rc;
2254 }
2255 
2256 static inline bool empty_bucket(const struct tcp_iter_state *st)
2257 {
2258 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2259 }
2260 
2261 /*
2262  * Get first established socket starting from bucket given in st->bucket.
2263  * If st->bucket is zero, the very first socket in the hash is returned.
2264  */
2265 static void *established_get_first(struct seq_file *seq)
2266 {
2267 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2268 	struct tcp_iter_state *st = seq->private;
2269 	struct net *net = seq_file_net(seq);
2270 	void *rc = NULL;
2271 
2272 	st->offset = 0;
2273 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2274 		struct sock *sk;
2275 		struct hlist_nulls_node *node;
2276 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2277 
2278 		/* Lockless fast path for the common case of empty buckets */
2279 		if (empty_bucket(st))
2280 			continue;
2281 
2282 		spin_lock_bh(lock);
2283 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2284 			if (sk->sk_family != afinfo->family ||
2285 			    !net_eq(sock_net(sk), net)) {
2286 				continue;
2287 			}
2288 			rc = sk;
2289 			goto out;
2290 		}
2291 		spin_unlock_bh(lock);
2292 	}
2293 out:
2294 	return rc;
2295 }
2296 
2297 static void *established_get_next(struct seq_file *seq, void *cur)
2298 {
2299 	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
2300 	struct sock *sk = cur;
2301 	struct hlist_nulls_node *node;
2302 	struct tcp_iter_state *st = seq->private;
2303 	struct net *net = seq_file_net(seq);
2304 
2305 	++st->num;
2306 	++st->offset;
2307 
2308 	sk = sk_nulls_next(sk);
2309 
2310 	sk_nulls_for_each_from(sk, node) {
2311 		if (sk->sk_family == afinfo->family &&
2312 		    net_eq(sock_net(sk), net))
2313 			return sk;
2314 	}
2315 
2316 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2317 	++st->bucket;
2318 	return established_get_first(seq);
2319 }
2320 
2321 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2322 {
2323 	struct tcp_iter_state *st = seq->private;
2324 	void *rc;
2325 
2326 	st->bucket = 0;
2327 	rc = established_get_first(seq);
2328 
2329 	while (rc && pos) {
2330 		rc = established_get_next(seq, rc);
2331 		--pos;
2332 	}
2333 	return rc;
2334 }
2335 
2336 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2337 {
2338 	void *rc;
2339 	struct tcp_iter_state *st = seq->private;
2340 
2341 	st->state = TCP_SEQ_STATE_LISTENING;
2342 	rc	  = listening_get_idx(seq, &pos);
2343 
2344 	if (!rc) {
2345 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2346 		rc	  = established_get_idx(seq, pos);
2347 	}
2348 
2349 	return rc;
2350 }
2351 
2352 static void *tcp_seek_last_pos(struct seq_file *seq)
2353 {
2354 	struct tcp_iter_state *st = seq->private;
2355 	int offset = st->offset;
2356 	int orig_num = st->num;
2357 	void *rc = NULL;
2358 
2359 	switch (st->state) {
2360 	case TCP_SEQ_STATE_LISTENING:
2361 		if (st->bucket >= INET_LHTABLE_SIZE)
2362 			break;
2363 		st->state = TCP_SEQ_STATE_LISTENING;
2364 		rc = listening_get_next(seq, NULL);
2365 		while (offset-- && rc)
2366 			rc = listening_get_next(seq, rc);
2367 		if (rc)
2368 			break;
2369 		st->bucket = 0;
2370 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2371 		fallthrough;
2372 	case TCP_SEQ_STATE_ESTABLISHED:
2373 		if (st->bucket > tcp_hashinfo.ehash_mask)
2374 			break;
2375 		rc = established_get_first(seq);
2376 		while (offset-- && rc)
2377 			rc = established_get_next(seq, rc);
2378 	}
2379 
2380 	st->num = orig_num;
2381 
2382 	return rc;
2383 }
2384 
2385 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2386 {
2387 	struct tcp_iter_state *st = seq->private;
2388 	void *rc;
2389 
2390 	if (*pos && *pos == st->last_pos) {
2391 		rc = tcp_seek_last_pos(seq);
2392 		if (rc)
2393 			goto out;
2394 	}
2395 
2396 	st->state = TCP_SEQ_STATE_LISTENING;
2397 	st->num = 0;
2398 	st->bucket = 0;
2399 	st->offset = 0;
2400 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2401 
2402 out:
2403 	st->last_pos = *pos;
2404 	return rc;
2405 }
2406 EXPORT_SYMBOL(tcp_seq_start);
2407 
2408 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2409 {
2410 	struct tcp_iter_state *st = seq->private;
2411 	void *rc = NULL;
2412 
2413 	if (v == SEQ_START_TOKEN) {
2414 		rc = tcp_get_idx(seq, 0);
2415 		goto out;
2416 	}
2417 
2418 	switch (st->state) {
2419 	case TCP_SEQ_STATE_LISTENING:
2420 		rc = listening_get_next(seq, v);
2421 		if (!rc) {
2422 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2423 			st->bucket = 0;
2424 			st->offset = 0;
2425 			rc	  = established_get_first(seq);
2426 		}
2427 		break;
2428 	case TCP_SEQ_STATE_ESTABLISHED:
2429 		rc = established_get_next(seq, v);
2430 		break;
2431 	}
2432 out:
2433 	++*pos;
2434 	st->last_pos = *pos;
2435 	return rc;
2436 }
2437 EXPORT_SYMBOL(tcp_seq_next);
2438 
2439 void tcp_seq_stop(struct seq_file *seq, void *v)
2440 {
2441 	struct tcp_iter_state *st = seq->private;
2442 
2443 	switch (st->state) {
2444 	case TCP_SEQ_STATE_LISTENING:
2445 		if (v != SEQ_START_TOKEN)
2446 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2447 		break;
2448 	case TCP_SEQ_STATE_ESTABLISHED:
2449 		if (v)
2450 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2451 		break;
2452 	}
2453 }
2454 EXPORT_SYMBOL(tcp_seq_stop);
2455 
2456 static void get_openreq4(const struct request_sock *req,
2457 			 struct seq_file *f, int i)
2458 {
2459 	const struct inet_request_sock *ireq = inet_rsk(req);
2460 	long delta = req->rsk_timer.expires - jiffies;
2461 
2462 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2463 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2464 		i,
2465 		ireq->ir_loc_addr,
2466 		ireq->ir_num,
2467 		ireq->ir_rmt_addr,
2468 		ntohs(ireq->ir_rmt_port),
2469 		TCP_SYN_RECV,
2470 		0, 0, /* could print option size, but that is af dependent. */
2471 		1,    /* timers active (only the expire timer) */
2472 		jiffies_delta_to_clock_t(delta),
2473 		req->num_timeout,
2474 		from_kuid_munged(seq_user_ns(f),
2475 				 sock_i_uid(req->rsk_listener)),
2476 		0,  /* non standard timer */
2477 		0, /* open_requests have no inode */
2478 		0,
2479 		req);
2480 }
2481 
2482 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2483 {
2484 	int timer_active;
2485 	unsigned long timer_expires;
2486 	const struct tcp_sock *tp = tcp_sk(sk);
2487 	const struct inet_connection_sock *icsk = inet_csk(sk);
2488 	const struct inet_sock *inet = inet_sk(sk);
2489 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2490 	__be32 dest = inet->inet_daddr;
2491 	__be32 src = inet->inet_rcv_saddr;
2492 	__u16 destp = ntohs(inet->inet_dport);
2493 	__u16 srcp = ntohs(inet->inet_sport);
2494 	int rx_queue;
2495 	int state;
2496 
2497 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2498 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2499 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2500 		timer_active	= 1;
2501 		timer_expires	= icsk->icsk_timeout;
2502 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2503 		timer_active	= 4;
2504 		timer_expires	= icsk->icsk_timeout;
2505 	} else if (timer_pending(&sk->sk_timer)) {
2506 		timer_active	= 2;
2507 		timer_expires	= sk->sk_timer.expires;
2508 	} else {
2509 		timer_active	= 0;
2510 		timer_expires = jiffies;
2511 	}
2512 
2513 	state = inet_sk_state_load(sk);
2514 	if (state == TCP_LISTEN)
2515 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2516 	else
2517 		/* Because we don't lock the socket,
2518 		 * we might find a transient negative value.
2519 		 */
2520 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2521 				      READ_ONCE(tp->copied_seq), 0);
2522 
2523 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2524 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2525 		i, src, srcp, dest, destp, state,
2526 		READ_ONCE(tp->write_seq) - tp->snd_una,
2527 		rx_queue,
2528 		timer_active,
2529 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2530 		icsk->icsk_retransmits,
2531 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2532 		icsk->icsk_probes_out,
2533 		sock_i_ino(sk),
2534 		refcount_read(&sk->sk_refcnt), sk,
2535 		jiffies_to_clock_t(icsk->icsk_rto),
2536 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2537 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2538 		tp->snd_cwnd,
2539 		state == TCP_LISTEN ?
2540 		    fastopenq->max_qlen :
2541 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2542 }
2543 
2544 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2545 			       struct seq_file *f, int i)
2546 {
2547 	long delta = tw->tw_timer.expires - jiffies;
2548 	__be32 dest, src;
2549 	__u16 destp, srcp;
2550 
2551 	dest  = tw->tw_daddr;
2552 	src   = tw->tw_rcv_saddr;
2553 	destp = ntohs(tw->tw_dport);
2554 	srcp  = ntohs(tw->tw_sport);
2555 
2556 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2557 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2558 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2559 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2560 		refcount_read(&tw->tw_refcnt), tw);
2561 }
2562 
2563 #define TMPSZ 150
2564 
2565 static int tcp4_seq_show(struct seq_file *seq, void *v)
2566 {
2567 	struct tcp_iter_state *st;
2568 	struct sock *sk = v;
2569 
2570 	seq_setwidth(seq, TMPSZ - 1);
2571 	if (v == SEQ_START_TOKEN) {
2572 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2573 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2574 			   "inode");
2575 		goto out;
2576 	}
2577 	st = seq->private;
2578 
2579 	if (sk->sk_state == TCP_TIME_WAIT)
2580 		get_timewait4_sock(v, seq, st->num);
2581 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2582 		get_openreq4(v, seq, st->num);
2583 	else
2584 		get_tcp4_sock(v, seq, st->num);
2585 out:
2586 	seq_pad(seq, '\n');
2587 	return 0;
2588 }
2589 
2590 static const struct seq_operations tcp4_seq_ops = {
2591 	.show		= tcp4_seq_show,
2592 	.start		= tcp_seq_start,
2593 	.next		= tcp_seq_next,
2594 	.stop		= tcp_seq_stop,
2595 };
2596 
2597 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2598 	.family		= AF_INET,
2599 };
2600 
2601 static int __net_init tcp4_proc_init_net(struct net *net)
2602 {
2603 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2604 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2605 		return -ENOMEM;
2606 	return 0;
2607 }
2608 
2609 static void __net_exit tcp4_proc_exit_net(struct net *net)
2610 {
2611 	remove_proc_entry("tcp", net->proc_net);
2612 }
2613 
2614 static struct pernet_operations tcp4_net_ops = {
2615 	.init = tcp4_proc_init_net,
2616 	.exit = tcp4_proc_exit_net,
2617 };
2618 
2619 int __init tcp4_proc_init(void)
2620 {
2621 	return register_pernet_subsys(&tcp4_net_ops);
2622 }
2623 
2624 void tcp4_proc_exit(void)
2625 {
2626 	unregister_pernet_subsys(&tcp4_net_ops);
2627 }
2628 #endif /* CONFIG_PROC_FS */
2629 
2630 struct proto tcp_prot = {
2631 	.name			= "TCP",
2632 	.owner			= THIS_MODULE,
2633 	.close			= tcp_close,
2634 	.pre_connect		= tcp_v4_pre_connect,
2635 	.connect		= tcp_v4_connect,
2636 	.disconnect		= tcp_disconnect,
2637 	.accept			= inet_csk_accept,
2638 	.ioctl			= tcp_ioctl,
2639 	.init			= tcp_v4_init_sock,
2640 	.destroy		= tcp_v4_destroy_sock,
2641 	.shutdown		= tcp_shutdown,
2642 	.setsockopt		= tcp_setsockopt,
2643 	.getsockopt		= tcp_getsockopt,
2644 	.keepalive		= tcp_set_keepalive,
2645 	.recvmsg		= tcp_recvmsg,
2646 	.sendmsg		= tcp_sendmsg,
2647 	.sendpage		= tcp_sendpage,
2648 	.backlog_rcv		= tcp_v4_do_rcv,
2649 	.release_cb		= tcp_release_cb,
2650 	.hash			= inet_hash,
2651 	.unhash			= inet_unhash,
2652 	.get_port		= inet_csk_get_port,
2653 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2654 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2655 	.stream_memory_free	= tcp_stream_memory_free,
2656 	.sockets_allocated	= &tcp_sockets_allocated,
2657 	.orphan_count		= &tcp_orphan_count,
2658 	.memory_allocated	= &tcp_memory_allocated,
2659 	.memory_pressure	= &tcp_memory_pressure,
2660 	.sysctl_mem		= sysctl_tcp_mem,
2661 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2662 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2663 	.max_header		= MAX_TCP_HEADER,
2664 	.obj_size		= sizeof(struct tcp_sock),
2665 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2666 	.twsk_prot		= &tcp_timewait_sock_ops,
2667 	.rsk_prot		= &tcp_request_sock_ops,
2668 	.h.hashinfo		= &tcp_hashinfo,
2669 	.no_autobind		= true,
2670 #ifdef CONFIG_COMPAT
2671 	.compat_setsockopt	= compat_tcp_setsockopt,
2672 	.compat_getsockopt	= compat_tcp_getsockopt,
2673 #endif
2674 	.diag_destroy		= tcp_abort,
2675 };
2676 EXPORT_SYMBOL(tcp_prot);
2677 
2678 static void __net_exit tcp_sk_exit(struct net *net)
2679 {
2680 	int cpu;
2681 
2682 	if (net->ipv4.tcp_congestion_control)
2683 		bpf_module_put(net->ipv4.tcp_congestion_control,
2684 			       net->ipv4.tcp_congestion_control->owner);
2685 
2686 	for_each_possible_cpu(cpu)
2687 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2688 	free_percpu(net->ipv4.tcp_sk);
2689 }
2690 
2691 static int __net_init tcp_sk_init(struct net *net)
2692 {
2693 	int res, cpu, cnt;
2694 
2695 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2696 	if (!net->ipv4.tcp_sk)
2697 		return -ENOMEM;
2698 
2699 	for_each_possible_cpu(cpu) {
2700 		struct sock *sk;
2701 
2702 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2703 					   IPPROTO_TCP, net);
2704 		if (res)
2705 			goto fail;
2706 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2707 
2708 		/* Please enforce IP_DF and IPID==0 for RST and
2709 		 * ACK sent in SYN-RECV and TIME-WAIT state.
2710 		 */
2711 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2712 
2713 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2714 	}
2715 
2716 	net->ipv4.sysctl_tcp_ecn = 2;
2717 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2718 
2719 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2720 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2721 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2722 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2723 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2724 
2725 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2726 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2727 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2728 
2729 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2730 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2731 	net->ipv4.sysctl_tcp_syncookies = 1;
2732 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2733 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2734 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2735 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2736 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2737 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2738 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2739 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2740 
2741 	cnt = tcp_hashinfo.ehash_mask + 1;
2742 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2743 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2744 
2745 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2746 	net->ipv4.sysctl_tcp_sack = 1;
2747 	net->ipv4.sysctl_tcp_window_scaling = 1;
2748 	net->ipv4.sysctl_tcp_timestamps = 1;
2749 	net->ipv4.sysctl_tcp_early_retrans = 3;
2750 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2751 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2752 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2753 	net->ipv4.sysctl_tcp_max_reordering = 300;
2754 	net->ipv4.sysctl_tcp_dsack = 1;
2755 	net->ipv4.sysctl_tcp_app_win = 31;
2756 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2757 	net->ipv4.sysctl_tcp_frto = 2;
2758 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2759 	/* This limits the percentage of the congestion window which we
2760 	 * will allow a single TSO frame to consume.  Building TSO frames
2761 	 * which are too large can cause TCP streams to be bursty.
2762 	 */
2763 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2764 	/* Default TSQ limit of 16 TSO segments */
2765 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2766 	/* rfc5961 challenge ack rate limiting */
2767 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2768 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2769 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2770 	net->ipv4.sysctl_tcp_autocorking = 1;
2771 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2772 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2773 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2774 	if (net != &init_net) {
2775 		memcpy(net->ipv4.sysctl_tcp_rmem,
2776 		       init_net.ipv4.sysctl_tcp_rmem,
2777 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2778 		memcpy(net->ipv4.sysctl_tcp_wmem,
2779 		       init_net.ipv4.sysctl_tcp_wmem,
2780 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2781 	}
2782 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2783 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2784 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2785 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2786 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2787 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2788 
2789 	/* Reno is always built in */
2790 	if (!net_eq(net, &init_net) &&
2791 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2792 			       init_net.ipv4.tcp_congestion_control->owner))
2793 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2794 	else
2795 		net->ipv4.tcp_congestion_control = &tcp_reno;
2796 
2797 	return 0;
2798 fail:
2799 	tcp_sk_exit(net);
2800 
2801 	return res;
2802 }
2803 
2804 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2805 {
2806 	struct net *net;
2807 
2808 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2809 
2810 	list_for_each_entry(net, net_exit_list, exit_list)
2811 		tcp_fastopen_ctx_destroy(net);
2812 }
2813 
2814 static struct pernet_operations __net_initdata tcp_sk_ops = {
2815        .init	   = tcp_sk_init,
2816        .exit	   = tcp_sk_exit,
2817        .exit_batch = tcp_sk_exit_batch,
2818 };
2819 
2820 void __init tcp_v4_init(void)
2821 {
2822 	if (register_pernet_subsys(&tcp_sk_ops))
2823 		panic("Failed to create the TCP control socket.\n");
2824 }
2825