xref: /linux/net/ipv4/tcp_ipv4.c (revision 221013afb459e5deb8bd08e29b37050af5586d1c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
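/* Prefix all pr_*() log messages emitted from this file with "TCP: ". */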
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 
83 #include <crypto/hash.h>
84 #include <linux/scatterlist.h>
85 
86 #include <trace/events/tcp.h>
87 
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92 
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95 
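/* Per-CPU kernel control socket used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to transmit replies that are not tied to a full socket;
 * bh_lock serializes its use where disabling BHs alone does not provide
 * exclusion (e.g. on PREEMPT_RT).
 */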
96 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
97 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
98 };
99 
100 static DEFINE_MUTEX(tcp_exit_batch_mutex);
101 
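/* Pick the initial sequence number (and, below, the timestamp offset) for a
 * new connection from the addresses and ports of the incoming segment.
 */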
102 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
103 {
104 	return secure_tcp_seq(ip_hdr(skb)->daddr,
105 			      ip_hdr(skb)->saddr,
106 			      tcp_hdr(skb)->dest,
107 			      tcp_hdr(skb)->source);
108 }
109 
110 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
111 {
112 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
113 }
114 
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
118 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
119 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
120 	struct tcp_sock *tp = tcp_sk(sk);
121 	int ts_recent_stamp;
122 
123 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
124 		reuse = 0;
125 
126 	if (reuse == 2) {
127 		/* Still does not detect *everything* that goes through
128 		 * lo, since we require a loopback src or dst address
129 		 * or direct binding to 'lo' interface.
130 		 */
131 		bool loopback = false;
132 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
133 			loopback = true;
134 #if IS_ENABLED(CONFIG_IPV6)
135 		if (tw->tw_family == AF_INET6) {
136 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
137 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
138 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
139 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
140 				loopback = true;
141 		} else
142 #endif
143 		{
144 			if (ipv4_is_loopback(tw->tw_daddr) ||
145 			    ipv4_is_loopback(tw->tw_rcv_saddr))
146 				loopback = true;
147 		}
148 		if (!loopback)
149 			reuse = 0;
150 	}
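	/* Note: tcp_tw_reuse == 2 is the "reuse for loopback traffic only"
	 * setting of the net.ipv4.tcp_tw_reuse sysctl, hence the loopback
	 * check above.
	 */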
151 
152 	/* With PAWS, it is safe from the viewpoint
153 	   of data integrity. Even without PAWS it is safe provided sequence
154 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
155 
156 	   Actually, the idea is close to VJ's: only the timestamp cache is
157 	   held not per host but per port pair, and the TW bucket is used as
158 	   the state holder.
159 
160 	   If the TW bucket has already been destroyed we fall back to VJ's
161 	   scheme and use the initial timestamp retrieved from the peer table.
162 	 */
163 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
164 	if (ts_recent_stamp &&
165 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
166 					    ts_recent_stamp)))) {
167 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
168 		 * and releasing the bucket lock.
169 		 */
170 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
171 			return 0;
172 
173 		/* In case of repair and re-using TIME-WAIT sockets we still
174 		 * want to be sure that it is safe as above but honor the
175 		 * sequence numbers and time stamps set as part of the repair
176 		 * process.
177 		 *
178 		 * Without this check re-using a TIME-WAIT socket with TCP
179 		 * repair would accumulate a -1 on the repair assigned
180 		 * sequence number. The first time it is reused the sequence
181 		 * is -1, the second time -2, etc. This fixes that issue
182 		 * without appearing to create any others.
183 		 */
184 		if (likely(!tp->repair)) {
185 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
186 
187 			if (!seq)
188 				seq = 1;
189 			WRITE_ONCE(tp->write_seq, seq);
190 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
191 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
192 		}
193 
194 		return 1;
195 	}
196 
197 	return 0;
198 }
199 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
200 
201 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
202 			      int addr_len)
203 {
204 	/* This check is replicated from tcp_v4_connect() and intended to
205 	 * prevent the BPF program called below from accessing bytes that are
206 	 * out of the bounds specified by the user in addr_len.
207 	 */
208 	if (addr_len < sizeof(struct sockaddr_in))
209 		return -EINVAL;
210 
211 	sock_owned_by_me(sk);
212 
213 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
214 }
215 
216 /* This will initiate an outgoing connection. */
217 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
218 {
219 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
220 	struct inet_timewait_death_row *tcp_death_row;
221 	struct inet_sock *inet = inet_sk(sk);
222 	struct tcp_sock *tp = tcp_sk(sk);
223 	struct ip_options_rcu *inet_opt;
224 	struct net *net = sock_net(sk);
225 	__be16 orig_sport, orig_dport;
226 	__be32 daddr, nexthop;
227 	struct flowi4 *fl4;
228 	struct rtable *rt;
229 	int err;
230 
231 	if (addr_len < sizeof(struct sockaddr_in))
232 		return -EINVAL;
233 
234 	if (usin->sin_family != AF_INET)
235 		return -EAFNOSUPPORT;
236 
237 	nexthop = daddr = usin->sin_addr.s_addr;
238 	inet_opt = rcu_dereference_protected(inet->inet_opt,
239 					     lockdep_sock_is_held(sk));
240 	if (inet_opt && inet_opt->opt.srr) {
241 		if (!daddr)
242 			return -EINVAL;
243 		nexthop = inet_opt->opt.faddr;
244 	}
245 
246 	orig_sport = inet->inet_sport;
247 	orig_dport = usin->sin_port;
248 	fl4 = &inet->cork.fl.u.ip4;
249 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
250 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
251 			      orig_dport, sk);
252 	if (IS_ERR(rt)) {
253 		err = PTR_ERR(rt);
254 		if (err == -ENETUNREACH)
255 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
256 		return err;
257 	}
258 
259 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
260 		ip_rt_put(rt);
261 		return -ENETUNREACH;
262 	}
263 
264 	if (!inet_opt || !inet_opt->opt.srr)
265 		daddr = fl4->daddr;
266 
267 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
268 
269 	if (!inet->inet_saddr) {
270 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
271 		if (err) {
272 			ip_rt_put(rt);
273 			return err;
274 		}
275 	} else {
276 		sk_rcv_saddr_set(sk, inet->inet_saddr);
277 	}
278 
279 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
280 		/* Reset inherited state */
281 		tp->rx_opt.ts_recent	   = 0;
282 		tp->rx_opt.ts_recent_stamp = 0;
283 		if (likely(!tp->repair))
284 			WRITE_ONCE(tp->write_seq, 0);
285 	}
286 
287 	inet->inet_dport = usin->sin_port;
288 	sk_daddr_set(sk, daddr);
289 
290 	inet_csk(sk)->icsk_ext_hdr_len = 0;
291 	if (inet_opt)
292 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
293 
294 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
295 
296 	/* Socket identity is still unknown (sport may be zero).
297 	 * However we set the state to SYN-SENT and, without releasing the
298 	 * socket lock, select a source port, enter ourselves into the hash
299 	 * tables and complete initialization after this.
300 	 */
301 	tcp_set_state(sk, TCP_SYN_SENT);
302 	err = inet_hash_connect(tcp_death_row, sk);
303 	if (err)
304 		goto failure;
305 
306 	sk_set_txhash(sk);
307 
308 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
309 			       inet->inet_sport, inet->inet_dport, sk);
310 	if (IS_ERR(rt)) {
311 		err = PTR_ERR(rt);
312 		rt = NULL;
313 		goto failure;
314 	}
315 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
316 	/* OK, now commit destination to socket.  */
317 	sk->sk_gso_type = SKB_GSO_TCPV4;
318 	sk_setup_caps(sk, &rt->dst);
319 	rt = NULL;
320 
321 	if (likely(!tp->repair)) {
322 		if (!tp->write_seq)
323 			WRITE_ONCE(tp->write_seq,
324 				   secure_tcp_seq(inet->inet_saddr,
325 						  inet->inet_daddr,
326 						  inet->inet_sport,
327 						  usin->sin_port));
328 		WRITE_ONCE(tp->tsoffset,
329 			   secure_tcp_ts_off(net, inet->inet_saddr,
330 					     inet->inet_daddr));
331 	}
332 
333 	atomic_set(&inet->inet_id, get_random_u16());
334 
335 	if (tcp_fastopen_defer_connect(sk, &err))
336 		return err;
337 	if (err)
338 		goto failure;
339 
340 	err = tcp_connect(sk);
341 
342 	if (err)
343 		goto failure;
344 
345 	return 0;
346 
347 failure:
348 	/*
349 	 * This unhashes the socket and releases the local port,
350 	 * if necessary.
351 	 */
352 	tcp_set_state(sk, TCP_CLOSE);
353 	inet_bhash2_reset_saddr(sk);
354 	ip_rt_put(rt);
355 	sk->sk_route_caps = 0;
356 	inet->inet_dport = 0;
357 	return err;
358 }
359 EXPORT_SYMBOL(tcp_v4_connect);
360 
361 /*
362  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
363  * It can be called through tcp_release_cb() if socket was owned by user
364  * at the time tcp_v4_err() was called to handle ICMP message.
365  */
366 void tcp_v4_mtu_reduced(struct sock *sk)
367 {
368 	struct inet_sock *inet = inet_sk(sk);
369 	struct dst_entry *dst;
370 	u32 mtu;
371 
372 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
373 		return;
374 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
375 	dst = inet_csk_update_pmtu(sk, mtu);
376 	if (!dst)
377 		return;
378 
379 	/* Something is about to go wrong... Remember the soft error
380 	 * in case this connection is not able to recover.
381 	 */
382 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
383 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
384 
385 	mtu = dst_mtu(dst);
386 
387 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
388 	    ip_sk_accept_pmtu(sk) &&
389 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
390 		tcp_sync_mss(sk, mtu);
391 
392 		/* Resend the TCP packet because it's
393 		 * clear that the old packet has been
394 		 * dropped. This is the new "fast" path mtu
395 		 * discovery.
396 		 */
397 		tcp_simple_retransmit(sk);
398 	} /* else let the usual retransmit timer handle it */
399 }
400 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
401 
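/* Propagate an ICMP redirect to the socket's cached route, if it is still
 * valid.
 */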
402 static void do_redirect(struct sk_buff *skb, struct sock *sk)
403 {
404 	struct dst_entry *dst = __sk_dst_check(sk, 0);
405 
406 	if (dst)
407 		dst->ops->redirect(dst, sk, skb);
408 }
409 
410 
411 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
412 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
413 {
414 	struct request_sock *req = inet_reqsk(sk);
415 	struct net *net = sock_net(sk);
416 
417 	/* ICMPs are not backlogged, hence we cannot get
418 	 * an established socket here.
419 	 */
420 	if (seq != tcp_rsk(req)->snt_isn) {
421 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
422 	} else if (abort) {
423 		/*
424 		 * Still in SYN_RECV, just remove it silently.
425 		 * There is no good way to pass the error to the newly
426 		 * created socket, and POSIX does not want network
427 		 * errors returned from accept().
428 		 */
429 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
430 		tcp_listendrop(req->rsk_listener);
431 	}
432 	reqsk_put(req);
433 }
434 EXPORT_SYMBOL(tcp_req_err);
435 
436 /* TCP-LD (RFC 6069) logic */
437 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
438 {
439 	struct inet_connection_sock *icsk = inet_csk(sk);
440 	struct tcp_sock *tp = tcp_sk(sk);
441 	struct sk_buff *skb;
442 	s32 remaining;
443 	u32 delta_us;
444 
445 	if (sock_owned_by_user(sk))
446 		return;
447 
448 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
449 	    !icsk->icsk_backoff)
450 		return;
451 
452 	skb = tcp_rtx_queue_head(sk);
453 	if (WARN_ON_ONCE(!skb))
454 		return;
455 
456 	icsk->icsk_backoff--;
457 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
458 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
459 
460 	tcp_mstamp_refresh(tp);
461 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
462 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
463 
464 	if (remaining > 0) {
465 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
466 					  remaining, TCP_RTO_MAX);
467 	} else {
468 		/* RTO revert clocked out retransmission.
469 		 * Will retransmit now.
470 		 */
471 		tcp_retransmit_timer(sk);
472 	}
473 }
474 EXPORT_SYMBOL(tcp_ld_RTO_revert);
475 
476 /*
477  * This routine is called by the ICMP module when it gets some
478  * sort of error condition.  If err < 0 then the socket should
479  * be closed and the error returned to the user.  If err > 0
480  * it's just the icmp type << 8 | icmp code.  After adjustment
481  * header points to the first 8 bytes of the tcp header.  We need
482  * to find the appropriate port.
483  *
484  * The locking strategy used here is very "optimistic". When
485  * someone else accesses the socket the ICMP is just dropped
486  * and for some paths there is no check at all.
487  * A more general error queue to queue errors for later handling
488  * is probably better.
489  *
490  */
491 
492 int tcp_v4_err(struct sk_buff *skb, u32 info)
493 {
494 	const struct iphdr *iph = (const struct iphdr *)skb->data;
495 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
496 	struct tcp_sock *tp;
497 	const int type = icmp_hdr(skb)->type;
498 	const int code = icmp_hdr(skb)->code;
499 	struct sock *sk;
500 	struct request_sock *fastopen;
501 	u32 seq, snd_una;
502 	int err;
503 	struct net *net = dev_net(skb->dev);
504 
505 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
506 				       iph->daddr, th->dest, iph->saddr,
507 				       ntohs(th->source), inet_iif(skb), 0);
508 	if (!sk) {
509 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
510 		return -ENOENT;
511 	}
512 	if (sk->sk_state == TCP_TIME_WAIT) {
513 		/* To increase the counter of ignored icmps for TCP-AO */
514 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
515 		inet_twsk_put(inet_twsk(sk));
516 		return 0;
517 	}
518 	seq = ntohl(th->seq);
519 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
520 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
521 				     type == ICMP_TIME_EXCEEDED ||
522 				     (type == ICMP_DEST_UNREACH &&
523 				      (code == ICMP_NET_UNREACH ||
524 				       code == ICMP_HOST_UNREACH)));
525 		return 0;
526 	}
527 
528 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
529 		sock_put(sk);
530 		return 0;
531 	}
532 
533 	bh_lock_sock(sk);
534 	/* If too many ICMPs get dropped on busy
535 	 * servers this needs to be solved differently.
536 	 * We do take care of the PMTU discovery (RFC1191) special case:
537 	 * we can receive locally generated ICMP messages while socket is held.
538 	 */
539 	if (sock_owned_by_user(sk)) {
540 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
541 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
542 	}
543 	if (sk->sk_state == TCP_CLOSE)
544 		goto out;
545 
546 	if (static_branch_unlikely(&ip4_min_ttl)) {
547 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
548 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
549 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
550 			goto out;
551 		}
552 	}
553 
554 	tp = tcp_sk(sk);
555 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
556 	fastopen = rcu_dereference(tp->fastopen_rsk);
557 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
558 	if (sk->sk_state != TCP_LISTEN &&
559 	    !between(seq, snd_una, tp->snd_nxt)) {
560 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
561 		goto out;
562 	}
563 
564 	switch (type) {
565 	case ICMP_REDIRECT:
566 		if (!sock_owned_by_user(sk))
567 			do_redirect(skb, sk);
568 		goto out;
569 	case ICMP_SOURCE_QUENCH:
570 		/* Just silently ignore these. */
571 		goto out;
572 	case ICMP_PARAMETERPROB:
573 		err = EPROTO;
574 		break;
575 	case ICMP_DEST_UNREACH:
576 		if (code > NR_ICMP_UNREACH)
577 			goto out;
578 
579 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
580 			/* We are not interested in TCP_LISTEN and open_requests
581 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
582 			 * they should go through unfragmented).
583 			 */
584 			if (sk->sk_state == TCP_LISTEN)
585 				goto out;
586 
587 			WRITE_ONCE(tp->mtu_info, info);
588 			if (!sock_owned_by_user(sk)) {
589 				tcp_v4_mtu_reduced(sk);
590 			} else {
591 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
592 					sock_hold(sk);
593 			}
594 			goto out;
595 		}
596 
597 		err = icmp_err_convert[code].errno;
598 		/* check if this ICMP message allows revert of backoff.
599 		 * (see RFC 6069)
600 		 */
601 		if (!fastopen &&
602 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
603 			tcp_ld_RTO_revert(sk, seq);
604 		break;
605 	case ICMP_TIME_EXCEEDED:
606 		err = EHOSTUNREACH;
607 		break;
608 	default:
609 		goto out;
610 	}
611 
612 	switch (sk->sk_state) {
613 	case TCP_SYN_SENT:
614 	case TCP_SYN_RECV:
615 		/* Only in fast or simultaneous open. If a fast open socket is
616 		 * already accepted it is treated as a connected one below.
617 		 */
618 		if (fastopen && !fastopen->sk)
619 			break;
620 
621 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
622 
623 		if (!sock_owned_by_user(sk))
624 			tcp_done_with_error(sk, err);
625 		else
626 			WRITE_ONCE(sk->sk_err_soft, err);
627 		goto out;
628 	}
629 
630 	/* If we've already connected we will keep trying
631 	 * until we time out, or the user gives up.
632 	 *
633 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
634 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
635 	 * but it is obsoleted by PMTU discovery).
636 	 *
637 	 * Note that in the modern internet, where routing is unreliable
638 	 * and broken firewalls sit in every dark corner sending random
639 	 * errors ordered by their masters, even these two messages finally
640 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
641 	 *
642 	 * Now we are in compliance with RFCs.
643 	 *							--ANK (980905)
644 	 */
645 
646 	if (!sock_owned_by_user(sk) &&
647 	    inet_test_bit(RECVERR, sk)) {
648 		WRITE_ONCE(sk->sk_err, err);
649 		sk_error_report(sk);
650 	} else	{ /* Only an error on timeout */
651 		WRITE_ONCE(sk->sk_err_soft, err);
652 	}
653 
654 out:
655 	bh_unlock_sock(sk);
656 	sock_put(sk);
657 	return 0;
658 }
659 
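/* Prepare the TCP checksum for checksum offload: seed th->check with the
 * pseudo-header checksum and record (csum_start/csum_offset) where the
 * final checksum must be written.
 */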
660 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
661 {
662 	struct tcphdr *th = tcp_hdr(skb);
663 
664 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
665 	skb->csum_start = skb_transport_header(skb) - skb->head;
666 	skb->csum_offset = offsetof(struct tcphdr, check);
667 }
668 
669 /* This routine computes an IPv4 TCP checksum. */
670 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
671 {
672 	const struct inet_sock *inet = inet_sk(sk);
673 
674 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
675 }
676 EXPORT_SYMBOL(tcp_v4_send_check);
677 
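/* Number of 32-bit words available for TCP options in a locally generated
 * reply (enough for the full TCP option space).
 */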
678 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
679 
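/* Sign an outgoing RST with TCP-AO when the offending segment carried an AO
 * option. Returns true if the RST must not be sent (no matching key, the
 * hash could not be computed, or TCP-AO is not compiled in).
 */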
680 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
681 				 const struct tcp_ao_hdr *aoh,
682 				 struct ip_reply_arg *arg, struct tcphdr *reply,
683 				 __be32 reply_options[REPLY_OPTIONS_LEN])
684 {
685 #ifdef CONFIG_TCP_AO
686 	int sdif = tcp_v4_sdif(skb);
687 	int dif = inet_iif(skb);
688 	int l3index = sdif ? dif : 0;
689 	bool allocated_traffic_key;
690 	struct tcp_ao_key *key;
691 	char *traffic_key;
692 	bool drop = true;
693 	u32 ao_sne = 0;
694 	u8 keyid;
695 
696 	rcu_read_lock();
697 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
698 				 &key, &traffic_key, &allocated_traffic_key,
699 				 &keyid, &ao_sne))
700 		goto out;
701 
702 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
703 				 (aoh->rnext_keyid << 8) | keyid);
704 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
705 	reply->doff = arg->iov[0].iov_len / 4;
706 
707 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
708 			    key, traffic_key,
709 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
710 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
711 			    reply, ao_sne))
712 		goto out;
713 	drop = false;
714 out:
715 	rcu_read_unlock();
716 	if (allocated_traffic_key)
717 		kfree(traffic_key);
718 	return drop;
719 #else
720 	return true;
721 #endif
722 }
723 
724 /*
725  *	This routine will send an RST to the other TCP.
726  *
727  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
728  *		      for the reset?
729  *	Answer: if a packet caused an RST, it is not for a socket
730  *		existing in our system; if it is matched to a socket,
731  *		it is just a duplicate segment or a bug in the other side's
732  *		TCP. So we build the reply based only on the parameters
733  *		that arrived with the segment.
734  *	Exception: precedence violation. We do not implement it in any case.
735  */
736 
737 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
738 			      enum sk_rst_reason reason)
739 {
740 	const struct tcphdr *th = tcp_hdr(skb);
741 	struct {
742 		struct tcphdr th;
743 		__be32 opt[REPLY_OPTIONS_LEN];
744 	} rep;
745 	const __u8 *md5_hash_location = NULL;
746 	const struct tcp_ao_hdr *aoh;
747 	struct ip_reply_arg arg;
748 #ifdef CONFIG_TCP_MD5SIG
749 	struct tcp_md5sig_key *key = NULL;
750 	unsigned char newhash[16];
751 	struct sock *sk1 = NULL;
752 	int genhash;
753 #endif
754 	u64 transmit_time = 0;
755 	struct sock *ctl_sk;
756 	struct net *net;
757 	u32 txhash = 0;
758 
759 	/* Never send a reset in response to a reset. */
760 	if (th->rst)
761 		return;
762 
763 	/* If sk not NULL, it means we did a successful lookup and incoming
764 	 * route had to be correct. prequeue might have dropped our dst.
765 	 */
766 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
767 		return;
768 
769 	/* Swap the send and the receive. */
770 	memset(&rep, 0, sizeof(rep));
771 	rep.th.dest   = th->source;
772 	rep.th.source = th->dest;
773 	rep.th.doff   = sizeof(struct tcphdr) / 4;
774 	rep.th.rst    = 1;
775 
776 	if (th->ack) {
777 		rep.th.seq = th->ack_seq;
778 	} else {
779 		rep.th.ack = 1;
780 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
781 				       skb->len - (th->doff << 2));
782 	}
783 
784 	memset(&arg, 0, sizeof(arg));
785 	arg.iov[0].iov_base = (unsigned char *)&rep;
786 	arg.iov[0].iov_len  = sizeof(rep.th);
787 
788 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
789 
790 	/* Invalid TCP option size or twice included auth */
791 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
792 		return;
793 
794 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
795 		return;
796 
797 #ifdef CONFIG_TCP_MD5SIG
798 	rcu_read_lock();
799 	if (sk && sk_fullsock(sk)) {
800 		const union tcp_md5_addr *addr;
801 		int l3index;
802 
803 		/* sdif set, means packet ingressed via a device
804 		/* sdif set means the packet ingressed via a device
805 		 */
806 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
807 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
808 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
809 	} else if (md5_hash_location) {
810 		const union tcp_md5_addr *addr;
811 		int sdif = tcp_v4_sdif(skb);
812 		int dif = inet_iif(skb);
813 		int l3index;
814 
815 		/*
816 		 * The active side is lost. Try to find the listening socket
817 		 * through the source port, and then find the MD5 key through
818 		 * the listening socket. We do not lose any security here:
819 		 * the incoming packet is checked against the MD5 hash of the
820 		 * key we find, and no RST is generated if the hash doesn't match.
821 		 */
822 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
823 					     NULL, 0, ip_hdr(skb)->saddr,
824 					     th->source, ip_hdr(skb)->daddr,
825 					     ntohs(th->source), dif, sdif);
826 		/* don't send an RST if we can't find a key */
827 		if (!sk1)
828 			goto out;
829 
830 		/* sdif set, means packet ingressed via a device
831 		/* sdif set means the packet ingressed via a device
832 		 */
833 		l3index = sdif ? dif : 0;
834 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
835 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
836 		if (!key)
837 			goto out;
838 
839 
840 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
841 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
842 			goto out;
843 
844 	}
845 
846 	if (key) {
847 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
848 				   (TCPOPT_NOP << 16) |
849 				   (TCPOPT_MD5SIG << 8) |
850 				   TCPOLEN_MD5SIG);
851 		/* Update length and the length the header thinks exists */
852 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
853 		rep.th.doff = arg.iov[0].iov_len / 4;
854 
855 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
856 				     key, ip_hdr(skb)->saddr,
857 				     ip_hdr(skb)->daddr, &rep.th);
858 	}
859 #endif
860 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
861 	if (rep.opt[0] == 0) {
862 		__be32 mrst = mptcp_reset_option(skb);
863 
864 		if (mrst) {
865 			rep.opt[0] = mrst;
866 			arg.iov[0].iov_len += sizeof(mrst);
867 			rep.th.doff = arg.iov[0].iov_len / 4;
868 		}
869 	}
870 
871 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
872 				      ip_hdr(skb)->saddr, /* XXX */
873 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
874 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
875 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
876 
877 	/* When the socket is gone, all binding information is lost and
878 	 * routing might fail. No choice here: if we choose to force the
879 	 * input interface, we will misroute in case of an asymmetric route.
880 	 */
881 	if (sk)
882 		arg.bound_dev_if = sk->sk_bound_dev_if;
883 
884 	trace_tcp_send_reset(sk, skb, reason);
885 
886 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
887 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
888 
889 	arg.tos = ip_hdr(skb)->tos;
890 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
891 	local_bh_disable();
892 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
893 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
894 
895 	sock_net_set(ctl_sk, net);
896 	if (sk) {
897 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
898 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
899 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
900 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
901 		transmit_time = tcp_transmit_time(sk);
902 		xfrm_sk_clone_policy(ctl_sk, sk);
903 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
904 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
905 	} else {
906 		ctl_sk->sk_mark = 0;
907 		ctl_sk->sk_priority = 0;
908 	}
909 	ip_send_unicast_reply(ctl_sk,
910 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
911 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
912 			      &arg, arg.iov[0].iov_len,
913 			      transmit_time, txhash);
914 
915 	xfrm_sk_free_policy(ctl_sk);
916 	sock_net_set(ctl_sk, &init_net);
917 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
918 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
919 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
920 	local_bh_enable();
921 
922 #ifdef CONFIG_TCP_MD5SIG
923 out:
924 	rcu_read_unlock();
925 #endif
926 }
927 
928 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
929    outside of socket context, is certainly ugly. What can I do?
930  */
931 
932 static void tcp_v4_send_ack(const struct sock *sk,
933 			    struct sk_buff *skb, u32 seq, u32 ack,
934 			    u32 win, u32 tsval, u32 tsecr, int oif,
935 			    struct tcp_key *key,
936 			    int reply_flags, u8 tos, u32 txhash)
937 {
938 	const struct tcphdr *th = tcp_hdr(skb);
939 	struct {
940 		struct tcphdr th;
941 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
942 	} rep;
943 	struct net *net = sock_net(sk);
944 	struct ip_reply_arg arg;
945 	struct sock *ctl_sk;
946 	u64 transmit_time;
947 
948 	memset(&rep.th, 0, sizeof(struct tcphdr));
949 	memset(&arg, 0, sizeof(arg));
950 
951 	arg.iov[0].iov_base = (unsigned char *)&rep;
952 	arg.iov[0].iov_len  = sizeof(rep.th);
953 	if (tsecr) {
954 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
955 				   (TCPOPT_TIMESTAMP << 8) |
956 				   TCPOLEN_TIMESTAMP);
957 		rep.opt[1] = htonl(tsval);
958 		rep.opt[2] = htonl(tsecr);
959 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
960 	}
961 
962 	/* Swap the send and the receive. */
963 	rep.th.dest    = th->source;
964 	rep.th.source  = th->dest;
965 	rep.th.doff    = arg.iov[0].iov_len / 4;
966 	rep.th.seq     = htonl(seq);
967 	rep.th.ack_seq = htonl(ack);
968 	rep.th.ack     = 1;
969 	rep.th.window  = htons(win);
970 
971 #ifdef CONFIG_TCP_MD5SIG
972 	if (tcp_key_is_md5(key)) {
973 		int offset = (tsecr) ? 3 : 0;
974 
975 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
976 					  (TCPOPT_NOP << 16) |
977 					  (TCPOPT_MD5SIG << 8) |
978 					  TCPOLEN_MD5SIG);
979 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
980 		rep.th.doff = arg.iov[0].iov_len/4;
981 
982 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
983 				    key->md5_key, ip_hdr(skb)->saddr,
984 				    ip_hdr(skb)->daddr, &rep.th);
985 	}
986 #endif
987 #ifdef CONFIG_TCP_AO
988 	if (tcp_key_is_ao(key)) {
989 		int offset = (tsecr) ? 3 : 0;
990 
991 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
992 					  (tcp_ao_len(key->ao_key) << 16) |
993 					  (key->ao_key->sndid << 8) |
994 					  key->rcv_next);
995 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
996 		rep.th.doff = arg.iov[0].iov_len / 4;
997 
998 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
999 				key->ao_key, key->traffic_key,
1000 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1001 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1002 				&rep.th, key->sne);
1003 	}
1004 #endif
1005 	arg.flags = reply_flags;
1006 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1007 				      ip_hdr(skb)->saddr, /* XXX */
1008 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1009 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1010 	if (oif)
1011 		arg.bound_dev_if = oif;
1012 	arg.tos = tos;
1013 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1014 	local_bh_disable();
1015 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1016 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1017 	sock_net_set(ctl_sk, net);
1018 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1019 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1020 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1021 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1022 	transmit_time = tcp_transmit_time(sk);
1023 	ip_send_unicast_reply(ctl_sk,
1024 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1025 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1026 			      &arg, arg.iov[0].iov_len,
1027 			      transmit_time, txhash);
1028 
1029 	sock_net_set(ctl_sk, &init_net);
1030 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1031 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1032 	local_bh_enable();
1033 }
1034 
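/* ACK a segment received for a TIME-WAIT socket, signing the ACK with
 * TCP-AO or MD5 when the timewait socket carries the corresponding keys.
 */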
1035 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1036 {
1037 	struct inet_timewait_sock *tw = inet_twsk(sk);
1038 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1039 	struct tcp_key key = {};
1040 #ifdef CONFIG_TCP_AO
1041 	struct tcp_ao_info *ao_info;
1042 
1043 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1044 		/* FIXME: the segment to-be-acked is not verified yet */
1045 		ao_info = rcu_dereference(tcptw->ao_info);
1046 		if (ao_info) {
1047 			const struct tcp_ao_hdr *aoh;
1048 
1049 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1050 				inet_twsk_put(tw);
1051 				return;
1052 			}
1053 
1054 			if (aoh)
1055 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1056 		}
1057 	}
1058 	if (key.ao_key) {
1059 		struct tcp_ao_key *rnext_key;
1060 
1061 		key.traffic_key = snd_other_key(key.ao_key);
1062 		key.sne = READ_ONCE(ao_info->snd_sne);
1063 		rnext_key = READ_ONCE(ao_info->rnext_key);
1064 		key.rcv_next = rnext_key->rcvid;
1065 		key.type = TCP_KEY_AO;
1066 #else
1067 	if (0) {
1068 #endif
1069 	} else if (static_branch_tcp_md5()) {
1070 		key.md5_key = tcp_twsk_md5_key(tcptw);
1071 		if (key.md5_key)
1072 			key.type = TCP_KEY_MD5;
1073 	}
1074 
1075 	tcp_v4_send_ack(sk, skb,
1076 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1077 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1078 			tcp_tw_tsval(tcptw),
1079 			READ_ONCE(tcptw->tw_ts_recent),
1080 			tw->tw_bound_dev_if, &key,
1081 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1082 			tw->tw_tos,
1083 			tw->tw_txhash);
1084 
1085 	inet_twsk_put(tw);
1086 }
1087 
1088 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1089 				  struct request_sock *req)
1090 {
1091 	struct tcp_key key = {};
1092 
1093 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1094 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1095 	 */
1096 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1097 					     tcp_sk(sk)->snd_nxt;
1098 
1099 #ifdef CONFIG_TCP_AO
1100 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1101 	    tcp_rsk_used_ao(req)) {
1102 		const union tcp_md5_addr *addr;
1103 		const struct tcp_ao_hdr *aoh;
1104 		int l3index;
1105 
1106 		/* Invalid TCP option size or twice included auth */
1107 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1108 			return;
1109 		if (!aoh)
1110 			return;
1111 
1112 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1113 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1114 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1115 					      aoh->rnext_keyid, -1);
1116 		if (unlikely(!key.ao_key)) {
1117 			/* Send ACK with any matching MKT for the peer */
1118 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1119 			/* Matching key disappeared (user removed the key?)
1120 			 * Let the handshake time out.
1121 			 */
1122 			if (!key.ao_key) {
1123 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1124 						     addr,
1125 						     ntohs(tcp_hdr(skb)->source),
1126 						     &ip_hdr(skb)->daddr,
1127 						     ntohs(tcp_hdr(skb)->dest));
1128 				return;
1129 			}
1130 		}
1131 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1132 		if (!key.traffic_key)
1133 			return;
1134 
1135 		key.type = TCP_KEY_AO;
1136 		key.rcv_next = aoh->keyid;
1137 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1138 #else
1139 	if (0) {
1140 #endif
1141 	} else if (static_branch_tcp_md5()) {
1142 		const union tcp_md5_addr *addr;
1143 		int l3index;
1144 
1145 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1146 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1147 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1148 		if (key.md5_key)
1149 			key.type = TCP_KEY_MD5;
1150 	}
1151 
1152 	tcp_v4_send_ack(sk, skb, seq,
1153 			tcp_rsk(req)->rcv_nxt,
1154 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1155 			tcp_rsk_tsval(tcp_rsk(req)),
1156 			READ_ONCE(req->ts_recent),
1157 			0, &key,
1158 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1159 			ip_hdr(skb)->tos,
1160 			READ_ONCE(tcp_rsk(req)->txhash));
1161 	if (tcp_key_is_ao(&key))
1162 		kfree(key.traffic_key);
1163 }
1164 
1165 /*
1166  *	Send a SYN-ACK after having received a SYN.
1167  *	This still operates on a request_sock only, not on a big
1168  *	socket.
1169  */
1170 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1171 			      struct flowi *fl,
1172 			      struct request_sock *req,
1173 			      struct tcp_fastopen_cookie *foc,
1174 			      enum tcp_synack_type synack_type,
1175 			      struct sk_buff *syn_skb)
1176 {
1177 	const struct inet_request_sock *ireq = inet_rsk(req);
1178 	struct flowi4 fl4;
1179 	int err = -1;
1180 	struct sk_buff *skb;
1181 	u8 tos;
1182 
1183 	/* First, grab a route. */
1184 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1185 		return -1;
1186 
1187 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1188 
1189 	if (skb) {
1190 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1191 
1192 		tos = READ_ONCE(inet_sk(sk)->tos);
1193 
1194 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1195 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1196 			      (tos & INET_ECN_MASK);
1197 
1198 		if (!INET_ECN_is_capable(tos) &&
1199 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1200 			tos |= INET_ECN_ECT_0;
1201 
1202 		rcu_read_lock();
1203 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1204 					    ireq->ir_rmt_addr,
1205 					    rcu_dereference(ireq->ireq_opt),
1206 					    tos);
1207 		rcu_read_unlock();
1208 		err = net_xmit_eval(err);
1209 	}
1210 
1211 	return err;
1212 }
1213 
1214 /*
1215  *	IPv4 request_sock destructor.
1216  */
1217 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1218 {
1219 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1220 }
1221 
1222 #ifdef CONFIG_TCP_MD5SIG
1223 /*
1224  * RFC2385 MD5 checksumming requires a mapping of
1225  * IP address->MD5 Key.
1226  * We need to maintain these in the sk structure.
1227  */
1228 
1229 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1230 EXPORT_SYMBOL(tcp_md5_needed);
1231 
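/* Key precedence for lookups: a key bound to an L3 domain beats an unbound
 * one; otherwise the longer (more specific) prefix wins.
 */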
1232 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1233 {
1234 	if (!old)
1235 		return true;
1236 
1237 	/* l3index always overrides non-l3index */
1238 	if (old->l3index && new->l3index == 0)
1239 		return false;
1240 	if (old->l3index == 0 && new->l3index)
1241 		return true;
1242 
1243 	return old->prefixlen < new->prefixlen;
1244 }
1245 
1246 /* Find the Key structure for an address.  */
1247 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1248 					   const union tcp_md5_addr *addr,
1249 					   int family, bool any_l3index)
1250 {
1251 	const struct tcp_sock *tp = tcp_sk(sk);
1252 	struct tcp_md5sig_key *key;
1253 	const struct tcp_md5sig_info *md5sig;
1254 	__be32 mask;
1255 	struct tcp_md5sig_key *best_match = NULL;
1256 	bool match;
1257 
1258 	/* caller either holds rcu_read_lock() or socket lock */
1259 	md5sig = rcu_dereference_check(tp->md5sig_info,
1260 				       lockdep_sock_is_held(sk));
1261 	if (!md5sig)
1262 		return NULL;
1263 
1264 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1265 				 lockdep_sock_is_held(sk)) {
1266 		if (key->family != family)
1267 			continue;
1268 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1269 		    key->l3index != l3index)
1270 			continue;
1271 		if (family == AF_INET) {
1272 			mask = inet_make_mask(key->prefixlen);
1273 			match = (key->addr.a4.s_addr & mask) ==
1274 				(addr->a4.s_addr & mask);
1275 #if IS_ENABLED(CONFIG_IPV6)
1276 		} else if (family == AF_INET6) {
1277 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1278 						  key->prefixlen);
1279 #endif
1280 		} else {
1281 			match = false;
1282 		}
1283 
1284 		if (match && better_md5_match(best_match, key))
1285 			best_match = key;
1286 	}
1287 	return best_match;
1288 }
1289 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1290 
1291 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1292 						      const union tcp_md5_addr *addr,
1293 						      int family, u8 prefixlen,
1294 						      int l3index, u8 flags)
1295 {
1296 	const struct tcp_sock *tp = tcp_sk(sk);
1297 	struct tcp_md5sig_key *key;
1298 	unsigned int size = sizeof(struct in_addr);
1299 	const struct tcp_md5sig_info *md5sig;
1300 
1301 	/* caller either holds rcu_read_lock() or socket lock */
1302 	md5sig = rcu_dereference_check(tp->md5sig_info,
1303 				       lockdep_sock_is_held(sk));
1304 	if (!md5sig)
1305 		return NULL;
1306 #if IS_ENABLED(CONFIG_IPV6)
1307 	if (family == AF_INET6)
1308 		size = sizeof(struct in6_addr);
1309 #endif
1310 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1311 				 lockdep_sock_is_held(sk)) {
1312 		if (key->family != family)
1313 			continue;
1314 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1315 			continue;
1316 		if (key->l3index != l3index)
1317 			continue;
1318 		if (!memcmp(&key->addr, addr, size) &&
1319 		    key->prefixlen == prefixlen)
1320 			return key;
1321 	}
1322 	return NULL;
1323 }
1324 
1325 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1326 					 const struct sock *addr_sk)
1327 {
1328 	const union tcp_md5_addr *addr;
1329 	int l3index;
1330 
1331 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1332 						 addr_sk->sk_bound_dev_if);
1333 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1334 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1335 }
1336 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1337 
1338 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1339 {
1340 	struct tcp_sock *tp = tcp_sk(sk);
1341 	struct tcp_md5sig_info *md5sig;
1342 
1343 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1344 	if (!md5sig)
1345 		return -ENOMEM;
1346 
1347 	sk_gso_disable(sk);
1348 	INIT_HLIST_HEAD(&md5sig->head);
1349 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1350 	return 0;
1351 }
1352 
1353 /* This can be called on a newly created socket, from other files */
1354 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1355 			    int family, u8 prefixlen, int l3index, u8 flags,
1356 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1357 {
1358 	/* Add Key to the list */
1359 	struct tcp_md5sig_key *key;
1360 	struct tcp_sock *tp = tcp_sk(sk);
1361 	struct tcp_md5sig_info *md5sig;
1362 
1363 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1364 	if (key) {
1365 		/* Pre-existing entry - just update that one.
1366 		 * Note that the key might be used concurrently.
1367 		 * data_race() tells KCSAN that we do not care about
1368 		 * key mismatches, since changing the MD5 key on live flows
1369 		 * can lead to packet drops.
1370 		 */
1371 		data_race(memcpy(key->key, newkey, newkeylen));
1372 
1373 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1374 		 * Also note that a reader could catch the new key->keylen value
1375 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1376 		 * at sock_kmalloc() time below these lines.
1377 		 */
1378 		WRITE_ONCE(key->keylen, newkeylen);
1379 
1380 		return 0;
1381 	}
1382 
1383 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1384 					   lockdep_sock_is_held(sk));
1385 
1386 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1387 	if (!key)
1388 		return -ENOMEM;
1389 
1390 	memcpy(key->key, newkey, newkeylen);
1391 	key->keylen = newkeylen;
1392 	key->family = family;
1393 	key->prefixlen = prefixlen;
1394 	key->l3index = l3index;
1395 	key->flags = flags;
1396 	memcpy(&key->addr, addr,
1397 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1398 								 sizeof(struct in_addr));
1399 	hlist_add_head_rcu(&key->node, &md5sig->head);
1400 	return 0;
1401 }
1402 
1403 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1404 		   int family, u8 prefixlen, int l3index, u8 flags,
1405 		   const u8 *newkey, u8 newkeylen)
1406 {
1407 	struct tcp_sock *tp = tcp_sk(sk);
1408 
1409 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1410 		if (tcp_md5_alloc_sigpool())
1411 			return -ENOMEM;
1412 
1413 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1414 			tcp_md5_release_sigpool();
1415 			return -ENOMEM;
1416 		}
1417 
1418 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1419 			struct tcp_md5sig_info *md5sig;
1420 
1421 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1422 			rcu_assign_pointer(tp->md5sig_info, NULL);
1423 			kfree_rcu(md5sig, rcu);
1424 			tcp_md5_release_sigpool();
1425 			return -EUSERS;
1426 		}
1427 	}
1428 
1429 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1430 				newkey, newkeylen, GFP_KERNEL);
1431 }
1432 EXPORT_SYMBOL(tcp_md5_do_add);
1433 
1434 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1435 		     int family, u8 prefixlen, int l3index,
1436 		     struct tcp_md5sig_key *key)
1437 {
1438 	struct tcp_sock *tp = tcp_sk(sk);
1439 
1440 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1441 		tcp_md5_add_sigpool();
1442 
1443 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1444 			tcp_md5_release_sigpool();
1445 			return -ENOMEM;
1446 		}
1447 
1448 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1449 			struct tcp_md5sig_info *md5sig;
1450 
1451 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1452 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1453 			rcu_assign_pointer(tp->md5sig_info, NULL);
1454 			kfree_rcu(md5sig, rcu);
1455 			tcp_md5_release_sigpool();
1456 			return -EUSERS;
1457 		}
1458 	}
1459 
1460 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1461 				key->flags, key->key, key->keylen,
1462 				sk_gfp_mask(sk, GFP_ATOMIC));
1463 }
1464 EXPORT_SYMBOL(tcp_md5_key_copy);
1465 
1466 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1467 		   u8 prefixlen, int l3index, u8 flags)
1468 {
1469 	struct tcp_md5sig_key *key;
1470 
1471 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1472 	if (!key)
1473 		return -ENOENT;
1474 	hlist_del_rcu(&key->node);
1475 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1476 	kfree_rcu(key, rcu);
1477 	return 0;
1478 }
1479 EXPORT_SYMBOL(tcp_md5_do_del);
1480 
1481 void tcp_clear_md5_list(struct sock *sk)
1482 {
1483 	struct tcp_sock *tp = tcp_sk(sk);
1484 	struct tcp_md5sig_key *key;
1485 	struct hlist_node *n;
1486 	struct tcp_md5sig_info *md5sig;
1487 
1488 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1489 
1490 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1491 		hlist_del_rcu(&key->node);
1492 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1493 		kfree_rcu(key, rcu);
1494 	}
1495 }
1496 
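/* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT socket options: validate the
 * request and add, replace or delete an MD5 key for the given peer address.
 */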
1497 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1498 				 sockptr_t optval, int optlen)
1499 {
1500 	struct tcp_md5sig cmd;
1501 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1502 	const union tcp_md5_addr *addr;
1503 	u8 prefixlen = 32;
1504 	int l3index = 0;
1505 	bool l3flag;
1506 	u8 flags;
1507 
1508 	if (optlen < sizeof(cmd))
1509 		return -EINVAL;
1510 
1511 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1512 		return -EFAULT;
1513 
1514 	if (sin->sin_family != AF_INET)
1515 		return -EINVAL;
1516 
1517 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1518 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519 
1520 	if (optname == TCP_MD5SIG_EXT &&
1521 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1522 		prefixlen = cmd.tcpm_prefixlen;
1523 		if (prefixlen > 32)
1524 			return -EINVAL;
1525 	}
1526 
1527 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1528 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1529 		struct net_device *dev;
1530 
1531 		rcu_read_lock();
1532 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1533 		if (dev && netif_is_l3_master(dev))
1534 			l3index = dev->ifindex;
1535 
1536 		rcu_read_unlock();
1537 
1538 		/* ok to reference set/not set outside of rcu;
1539 		 * right now device MUST be an L3 master
1540 		 */
1541 		if (!dev || !l3index)
1542 			return -EINVAL;
1543 	}
1544 
1545 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1546 
1547 	if (!cmd.tcpm_keylen)
1548 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1549 
1550 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1551 		return -EINVAL;
1552 
1553 	/* Don't allow keys for peers that have a matching TCP-AO key.
1554 	 * See the comment in tcp_ao_add_cmd()
1555 	 */
1556 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1557 		return -EKEYREJECTED;
1558 
1559 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1560 			      cmd.tcpm_key, cmd.tcpm_keylen);
1561 }
1562 
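/* Hash the IPv4 pseudo-header and the TCP header (with its checksum field
 * zeroed) as the first part of the RFC 2385 MD5 signature.
 */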
1563 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1564 				   __be32 daddr, __be32 saddr,
1565 				   const struct tcphdr *th, int nbytes)
1566 {
1567 	struct tcp4_pseudohdr *bp;
1568 	struct scatterlist sg;
1569 	struct tcphdr *_th;
1570 
1571 	bp = hp->scratch;
1572 	bp->saddr = saddr;
1573 	bp->daddr = daddr;
1574 	bp->pad = 0;
1575 	bp->protocol = IPPROTO_TCP;
1576 	bp->len = cpu_to_be16(nbytes);
1577 
1578 	_th = (struct tcphdr *)(bp + 1);
1579 	memcpy(_th, th, sizeof(*th));
1580 	_th->check = 0;
1581 
1582 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1583 	ahash_request_set_crypt(hp->req, &sg, NULL,
1584 				sizeof(*bp) + sizeof(*th));
1585 	return crypto_ahash_update(hp->req);
1586 }
1587 
1588 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1589 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1590 {
1591 	struct tcp_sigpool hp;
1592 
1593 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1594 		goto clear_hash_nostart;
1595 
1596 	if (crypto_ahash_init(hp.req))
1597 		goto clear_hash;
1598 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1599 		goto clear_hash;
1600 	if (tcp_md5_hash_key(&hp, key))
1601 		goto clear_hash;
1602 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1603 	if (crypto_ahash_final(hp.req))
1604 		goto clear_hash;
1605 
1606 	tcp_sigpool_end(&hp);
1607 	return 0;
1608 
1609 clear_hash:
1610 	tcp_sigpool_end(&hp);
1611 clear_hash_nostart:
1612 	memset(md5_hash, 0, 16);
1613 	return 1;
1614 }
1615 
1616 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1617 			const struct sock *sk,
1618 			const struct sk_buff *skb)
1619 {
1620 	const struct tcphdr *th = tcp_hdr(skb);
1621 	struct tcp_sigpool hp;
1622 	__be32 saddr, daddr;
1623 
1624 	if (sk) { /* valid for established/request sockets */
1625 		saddr = sk->sk_rcv_saddr;
1626 		daddr = sk->sk_daddr;
1627 	} else {
1628 		const struct iphdr *iph = ip_hdr(skb);
1629 		saddr = iph->saddr;
1630 		daddr = iph->daddr;
1631 	}
1632 
1633 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1634 		goto clear_hash_nostart;
1635 
1636 	if (crypto_ahash_init(hp.req))
1637 		goto clear_hash;
1638 
1639 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1640 		goto clear_hash;
1641 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1642 		goto clear_hash;
1643 	if (tcp_md5_hash_key(&hp, key))
1644 		goto clear_hash;
1645 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1646 	if (crypto_ahash_final(hp.req))
1647 		goto clear_hash;
1648 
1649 	tcp_sigpool_end(&hp);
1650 	return 0;
1651 
1652 clear_hash:
1653 	tcp_sigpool_end(&hp);
1654 clear_hash_nostart:
1655 	memset(md5_hash, 0, 16);
1656 	return 1;
1657 }
1658 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1659 
1660 #endif
1661 
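/* Initialize the IPv4-specific fields of a request socket from the incoming
 * SYN: the address pair and any IP options saved for building the SYN-ACK.
 */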
1662 static void tcp_v4_init_req(struct request_sock *req,
1663 			    const struct sock *sk_listener,
1664 			    struct sk_buff *skb)
1665 {
1666 	struct inet_request_sock *ireq = inet_rsk(req);
1667 	struct net *net = sock_net(sk_listener);
1668 
1669 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1670 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1671 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1672 }
1673 
1674 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1675 					  struct sk_buff *skb,
1676 					  struct flowi *fl,
1677 					  struct request_sock *req,
1678 					  u32 tw_isn)
1679 {
1680 	tcp_v4_init_req(req, sk, skb);
1681 
1682 	if (security_inet_conn_request(sk, skb, req))
1683 		return NULL;
1684 
1685 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1686 }
1687 
1688 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1689 	.family		=	PF_INET,
1690 	.obj_size	=	sizeof(struct tcp_request_sock),
1691 	.rtx_syn_ack	=	tcp_rtx_synack,
1692 	.send_ack	=	tcp_v4_reqsk_send_ack,
1693 	.destructor	=	tcp_v4_reqsk_destructor,
1694 	.send_reset	=	tcp_v4_send_reset,
1695 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1696 };
1697 
1698 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1699 	.mss_clamp	=	TCP_MSS_DEFAULT,
1700 #ifdef CONFIG_TCP_MD5SIG
1701 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1702 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1703 #endif
1704 #ifdef CONFIG_TCP_AO
1705 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1706 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1707 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1708 #endif
1709 #ifdef CONFIG_SYN_COOKIES
1710 	.cookie_init_seq =	cookie_v4_init_sequence,
1711 #endif
1712 	.route_req	=	tcp_v4_route_req,
1713 	.init_seq	=	tcp_v4_init_seq,
1714 	.init_ts_off	=	tcp_v4_init_ts_off,
1715 	.send_synack	=	tcp_v4_send_synack,
1716 };
1717 
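/* Entry point for a SYN received on an IPv4 listener; hands the request off
 * to the protocol-independent tcp_conn_request().
 */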
1718 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1719 {
1720 	/* Never answer SYNs sent to broadcast or multicast */
1721 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1722 		goto drop;
1723 
1724 	return tcp_conn_request(&tcp_request_sock_ops,
1725 				&tcp_request_sock_ipv4_ops, sk, skb);
1726 
1727 drop:
1728 	tcp_listendrop(sk);
1729 	return 0;
1730 }
1731 EXPORT_SYMBOL(tcp_v4_conn_request);
1732 
1733 
1734 /*
1735  * The three-way handshake has completed - we got a valid ACK -
1736  * now create the new socket.
1737  */
1738 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1739 				  struct request_sock *req,
1740 				  struct dst_entry *dst,
1741 				  struct request_sock *req_unhash,
1742 				  bool *own_req)
1743 {
1744 	struct inet_request_sock *ireq;
1745 	bool found_dup_sk = false;
1746 	struct inet_sock *newinet;
1747 	struct tcp_sock *newtp;
1748 	struct sock *newsk;
1749 #ifdef CONFIG_TCP_MD5SIG
1750 	const union tcp_md5_addr *addr;
1751 	struct tcp_md5sig_key *key;
1752 	int l3index;
1753 #endif
1754 	struct ip_options_rcu *inet_opt;
1755 
1756 	if (sk_acceptq_is_full(sk))
1757 		goto exit_overflow;
1758 
1759 	newsk = tcp_create_openreq_child(sk, req, skb);
1760 	if (!newsk)
1761 		goto exit_nonewsk;
1762 
1763 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1764 	inet_sk_rx_dst_set(newsk, skb);
1765 
1766 	newtp		      = tcp_sk(newsk);
1767 	newinet		      = inet_sk(newsk);
1768 	ireq		      = inet_rsk(req);
1769 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1770 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1771 	newsk->sk_bound_dev_if = ireq->ir_iif;
1772 	newinet->inet_saddr   = ireq->ir_loc_addr;
1773 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1774 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1775 	newinet->mc_index     = inet_iif(skb);
1776 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1777 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1778 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1779 	if (inet_opt)
1780 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1781 	atomic_set(&newinet->inet_id, get_random_u16());
1782 
1783 	/* Set ToS of the new socket based upon the value of incoming SYN.
1784 	 * ECT bits are set later in tcp_init_transfer().
1785 	 */
1786 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1787 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1788 
1789 	if (!dst) {
1790 		dst = inet_csk_route_child_sock(sk, newsk, req);
1791 		if (!dst)
1792 			goto put_and_exit;
1793 	} else {
1794 		/* syncookie case : see end of cookie_v4_check() */
1795 	}
1796 	sk_setup_caps(newsk, dst);
1797 
1798 	tcp_ca_openreq_child(newsk, dst);
1799 
1800 	tcp_sync_mss(newsk, dst_mtu(dst));
1801 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1802 
1803 	tcp_initialize_rcv_mss(newsk);
1804 
1805 #ifdef CONFIG_TCP_MD5SIG
1806 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1807 	/* Copy over the MD5 key from the original socket */
1808 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1809 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1810 	if (key && !tcp_rsk_used_ao(req)) {
1811 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1812 			goto put_and_exit;
1813 		sk_gso_disable(newsk);
1814 	}
1815 #endif
1816 #ifdef CONFIG_TCP_AO
1817 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1818 		goto put_and_exit; /* OOM, release back memory */
1819 #endif
1820 
1821 	if (__inet_inherit_port(sk, newsk) < 0)
1822 		goto put_and_exit;
1823 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1824 				       &found_dup_sk);
1825 	if (likely(*own_req)) {
1826 		tcp_move_syn(newtp, req);
1827 		ireq->ireq_opt = NULL;
1828 	} else {
1829 		newinet->inet_opt = NULL;
1830 
1831 		if (!req_unhash && found_dup_sk) {
1832 			/* This code path should be executed only in the
1833 			 * syncookie case
1834 			 */
1835 			bh_unlock_sock(newsk);
1836 			sock_put(newsk);
1837 			newsk = NULL;
1838 		}
1839 	}
1840 	return newsk;
1841 
1842 exit_overflow:
1843 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1844 exit_nonewsk:
1845 	dst_release(dst);
1846 exit:
1847 	tcp_listendrop(sk);
1848 	return NULL;
1849 put_and_exit:
1850 	newinet->inet_opt = NULL;
1851 	inet_csk_prepare_forced_close(newsk);
1852 	tcp_done(newsk);
1853 	goto exit;
1854 }
1855 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
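/*
 * Editor's note (illustrative, not part of the original source): a worked
 * example of the sysctl_tcp_reflect_tos branch in tcp_v4_syn_recv_sock()
 * above.  INET_ECN_MASK covers the two ECN bits of the ToS byte, so a SYN
 * arriving with tos 0xb9 (DSCP EF, 46 << 2, plus ECT(1)) yields
 * newinet->tos = 0xb9 & ~0x3 = 0xb8: the DSCP is reflected back towards the
 * peer while the ECN bits are cleared and handled separately later in
 * tcp_init_transfer().
 */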
1856 
1857 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1858 {
1859 #ifdef CONFIG_SYN_COOKIES
1860 	const struct tcphdr *th = tcp_hdr(skb);
1861 
1862 	if (!th->syn)
1863 		sk = cookie_v4_check(sk, skb);
1864 #endif
1865 	return sk;
1866 }
1867 
1868 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1869 			 struct tcphdr *th, u32 *cookie)
1870 {
1871 	u16 mss = 0;
1872 #ifdef CONFIG_SYN_COOKIES
1873 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1874 				    &tcp_request_sock_ipv4_ops, sk, th);
1875 	if (mss) {
1876 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1877 		tcp_synq_overflow(sk);
1878 	}
1879 #endif
1880 	return mss;
1881 }
1882 
1883 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1884 							   u32));
1885 /* The socket must have its spinlock held when we get
1886  * here, unless it is a TCP_LISTEN socket.
1887  *
1888  * We have a potential double-lock case here, so even when
1889  * doing backlog processing we use the BH locking scheme.
1890  * This is because we cannot sleep with the original spinlock
1891  * held.
1892  */
1893 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1894 {
1895 	enum skb_drop_reason reason;
1896 	struct sock *rsk;
1897 
1898 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1899 		struct dst_entry *dst;
1900 
1901 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1902 						lockdep_sock_is_held(sk));
1903 
1904 		sock_rps_save_rxhash(sk, skb);
1905 		sk_mark_napi_id(sk, skb);
1906 		if (dst) {
1907 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1908 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1909 					     dst, 0)) {
1910 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1911 				dst_release(dst);
1912 			}
1913 		}
1914 		tcp_rcv_established(sk, skb);
1915 		return 0;
1916 	}
1917 
1918 	if (tcp_checksum_complete(skb))
1919 		goto csum_err;
1920 
1921 	if (sk->sk_state == TCP_LISTEN) {
1922 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1923 
1924 		if (!nsk)
1925 			return 0;
1926 		if (nsk != sk) {
1927 			reason = tcp_child_process(sk, nsk, skb);
1928 			if (reason) {
1929 				rsk = nsk;
1930 				goto reset;
1931 			}
1932 			return 0;
1933 		}
1934 	} else
1935 		sock_rps_save_rxhash(sk, skb);
1936 
1937 	reason = tcp_rcv_state_process(sk, skb);
1938 	if (reason) {
1939 		rsk = sk;
1940 		goto reset;
1941 	}
1942 	return 0;
1943 
1944 reset:
1945 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1946 discard:
1947 	sk_skb_reason_drop(sk, skb, reason);
1948 	/* Be careful here. If this function gets more complicated and
1949 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1950 	 * might be destroyed here. This current version compiles correctly,
1951 	 * but you have been warned.
1952 	 */
1953 	return 0;
1954 
1955 csum_err:
1956 	reason = SKB_DROP_REASON_TCP_CSUM;
1957 	trace_tcp_bad_csum(skb);
1958 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1959 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1960 	goto discard;
1961 }
1962 EXPORT_SYMBOL(tcp_v4_do_rcv);
1963 
1964 int tcp_v4_early_demux(struct sk_buff *skb)
1965 {
1966 	struct net *net = dev_net(skb->dev);
1967 	const struct iphdr *iph;
1968 	const struct tcphdr *th;
1969 	struct sock *sk;
1970 
1971 	if (skb->pkt_type != PACKET_HOST)
1972 		return 0;
1973 
1974 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1975 		return 0;
1976 
1977 	iph = ip_hdr(skb);
1978 	th = tcp_hdr(skb);
1979 
1980 	if (th->doff < sizeof(struct tcphdr) / 4)
1981 		return 0;
1982 
1983 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1984 				       iph->saddr, th->source,
1985 				       iph->daddr, ntohs(th->dest),
1986 				       skb->skb_iif, inet_sdif(skb));
1987 	if (sk) {
1988 		skb->sk = sk;
1989 		skb->destructor = sock_edemux;
1990 		if (sk_fullsock(sk)) {
1991 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1992 
1993 			if (dst)
1994 				dst = dst_check(dst, 0);
1995 			if (dst &&
1996 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1997 				skb_dst_set_noref(skb, dst);
1998 		}
1999 	}
2000 	return 0;
2001 }
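/*
 * Editor's note (illustrative, not part of the original source): the
 * th->doff < sizeof(struct tcphdr) / 4 check above works because doff is the
 * TCP header length in 32-bit words; any value below 5 describes a header
 * shorter than the 20-byte minimum, so early demux simply declines and lets
 * the normal receive path (tcp_v4_rcv()) flag the packet as bad.
 */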
2002 
2003 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2004 		     enum skb_drop_reason *reason)
2005 {
2006 	u32 tail_gso_size, tail_gso_segs;
2007 	struct skb_shared_info *shinfo;
2008 	const struct tcphdr *th;
2009 	struct tcphdr *thtail;
2010 	struct sk_buff *tail;
2011 	unsigned int hdrlen;
2012 	bool fragstolen;
2013 	u32 gso_segs;
2014 	u32 gso_size;
2015 	u64 limit;
2016 	int delta;
2017 
2018 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2019 	 * we can fix skb->truesize to its real value to avoid future drops.
2020 	 * This is valid because skb is not yet charged to the socket.
2021 	 * It has been noticed that pure SACK packets were sometimes dropped
2022 	 * (if cooked by drivers without the copybreak feature).
2023 	 */
2024 	skb_condense(skb);
2025 
2026 	skb_dst_drop(skb);
2027 
2028 	if (unlikely(tcp_checksum_complete(skb))) {
2029 		bh_unlock_sock(sk);
2030 		trace_tcp_bad_csum(skb);
2031 		*reason = SKB_DROP_REASON_TCP_CSUM;
2032 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2033 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2034 		return true;
2035 	}
2036 
2037 	/* Attempt coalescing to last skb in backlog, even if we are
2038 	 * above the limits.
2039 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2040 	 */
2041 	th = (const struct tcphdr *)skb->data;
2042 	hdrlen = th->doff * 4;
2043 
2044 	tail = sk->sk_backlog.tail;
2045 	if (!tail)
2046 		goto no_coalesce;
2047 	thtail = (struct tcphdr *)tail->data;
2048 
2049 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2050 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2051 	    ((TCP_SKB_CB(tail)->tcp_flags |
2052 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2053 	    !((TCP_SKB_CB(tail)->tcp_flags &
2054 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2055 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2056 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2057 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2058 	    thtail->doff != th->doff ||
2059 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2060 		goto no_coalesce;
2061 
2062 	__skb_pull(skb, hdrlen);
2063 
2064 	shinfo = skb_shinfo(skb);
2065 	gso_size = shinfo->gso_size ?: skb->len;
2066 	gso_segs = shinfo->gso_segs ?: 1;
2067 
2068 	shinfo = skb_shinfo(tail);
2069 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2070 	tail_gso_segs = shinfo->gso_segs ?: 1;
2071 
2072 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2073 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2074 
2075 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2076 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2077 			thtail->window = th->window;
2078 		}
2079 
2080 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2081 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2082 		 * is not entered if we append a packet with a FIN.
2083 		 * SYN, RST, URG are not present.
2084 		 * ACK is set on both packets.
2085 		 * PSH : we do not really care about it in the TCP stack,
2086 		 *       at least for 'GRO' packets.
2087 		 */
2088 		thtail->fin |= th->fin;
2089 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2090 
2091 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2092 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2093 			tail->tstamp = skb->tstamp;
2094 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2095 		}
2096 
2097 		/* Not as strict as GRO. We only need to carry mss max value */
2098 		shinfo->gso_size = max(gso_size, tail_gso_size);
2099 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2100 
2101 		sk->sk_backlog.len += delta;
2102 		__NET_INC_STATS(sock_net(sk),
2103 				LINUX_MIB_TCPBACKLOGCOALESCE);
2104 		kfree_skb_partial(skb, fragstolen);
2105 		return false;
2106 	}
2107 	__skb_push(skb, hdrlen);
2108 
2109 no_coalesce:
2110 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2111 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2112 	 * sk_rcvbuf in normal conditions.
2113 	 */
2114 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2115 
2116 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2117 
2118 	/* Only the socket owner can try to collapse/prune rx queues
2119 	 * to reduce memory overhead, so add a little headroom here.
2120 	 * Only a few socket backlogs are likely to be non-empty concurrently.
2121 	 */
2122 	limit += 64 * 1024;
2123 
2124 	limit = min_t(u64, limit, UINT_MAX);
2125 
2126 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2127 		bh_unlock_sock(sk);
2128 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2129 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2130 		return true;
2131 	}
2132 	return false;
2133 }
2134 EXPORT_SYMBOL(tcp_add_backlog);
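/*
 * Editor's note (illustrative, not part of the original source): rough
 * arithmetic for the backlog limit computed in tcp_add_backlog() above,
 * assuming an untuned socket with the common defaults tcp_rmem[1] = 131072
 * and tcp_wmem[1] = 16384 (receive autotuning may grow sk_rcvbuf well past
 * this):
 *
 *	limit = 2 * sk_rcvbuf + sk_sndbuf / 2 + 64 KB
 *	      = 262144 + 8192 + 65536 = 335872 bytes,
 *
 * then clamped to UINT_MAX.  Exceeding it drops the skb with
 * SKB_DROP_REASON_SOCKET_BACKLOG and bumps LINUX_MIB_TCPBACKLOGDROP.
 */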
2135 
2136 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2137 {
2138 	struct tcphdr *th = (struct tcphdr *)skb->data;
2139 
2140 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2141 }
2142 EXPORT_SYMBOL(tcp_filter);
2143 
2144 static void tcp_v4_restore_cb(struct sk_buff *skb)
2145 {
2146 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2147 		sizeof(struct inet_skb_parm));
2148 }
2149 
2150 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2151 			   const struct tcphdr *th)
2152 {
2153 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2154 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2155 	 */
2156 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2157 		sizeof(struct inet_skb_parm));
2158 	barrier();
2159 
2160 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2161 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2162 				    skb->len - th->doff * 4);
2163 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2164 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2165 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2166 	TCP_SKB_CB(skb)->sacked	 = 0;
2167 	TCP_SKB_CB(skb)->has_rxtstamp =
2168 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2169 }
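/*
 * Editor's note (illustrative, not part of the original source): the end_seq
 * computed above is seq + th->syn + th->fin + payload bytes, where the
 * payload is skb->len - th->doff * 4.  For example, a pure data segment with
 * seq 1000 carrying 100 bytes gets end_seq 1100, while a bare SYN consumes
 * exactly one sequence number and gets end_seq = seq + 1.
 */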
2170 
2171 /*
2172  *	From tcp_input.c
2173  */
2174 
2175 int tcp_v4_rcv(struct sk_buff *skb)
2176 {
2177 	struct net *net = dev_net(skb->dev);
2178 	enum skb_drop_reason drop_reason;
2179 	int sdif = inet_sdif(skb);
2180 	int dif = inet_iif(skb);
2181 	const struct iphdr *iph;
2182 	const struct tcphdr *th;
2183 	struct sock *sk = NULL;
2184 	bool refcounted;
2185 	int ret;
2186 	u32 isn;
2187 
2188 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2189 	if (skb->pkt_type != PACKET_HOST)
2190 		goto discard_it;
2191 
2192 	/* Count it even if it's bad */
2193 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2194 
2195 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2196 		goto discard_it;
2197 
2198 	th = (const struct tcphdr *)skb->data;
2199 
2200 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2201 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2202 		goto bad_packet;
2203 	}
2204 	if (!pskb_may_pull(skb, th->doff * 4))
2205 		goto discard_it;
2206 
2207 	/* An explanation is required here, I think.
2208 	 * Packet length and doff are validated by header prediction,
2209 	 * provided the case of th->doff==0 is eliminated.
2210 	 * So, we defer the checks. */
2211 
2212 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2213 		goto csum_error;
2214 
2215 	th = (const struct tcphdr *)skb->data;
2216 	iph = ip_hdr(skb);
2217 lookup:
2218 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2219 			       skb, __tcp_hdrlen(th), th->source,
2220 			       th->dest, sdif, &refcounted);
2221 	if (!sk)
2222 		goto no_tcp_socket;
2223 
2224 	if (sk->sk_state == TCP_TIME_WAIT)
2225 		goto do_time_wait;
2226 
2227 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2228 		struct request_sock *req = inet_reqsk(sk);
2229 		bool req_stolen = false;
2230 		struct sock *nsk;
2231 
2232 		sk = req->rsk_listener;
2233 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2234 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2235 		else
2236 			drop_reason = tcp_inbound_hash(sk, req, skb,
2237 						       &iph->saddr, &iph->daddr,
2238 						       AF_INET, dif, sdif);
2239 		if (unlikely(drop_reason)) {
2240 			sk_drops_add(sk, skb);
2241 			reqsk_put(req);
2242 			goto discard_it;
2243 		}
2244 		if (tcp_checksum_complete(skb)) {
2245 			reqsk_put(req);
2246 			goto csum_error;
2247 		}
2248 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2249 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2250 			if (!nsk) {
2251 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2252 				goto lookup;
2253 			}
2254 			sk = nsk;
2255 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2256 			 * reference before returning.
2257 			 */
2258 		} else {
2259 			/* We own a reference on the listener, increase it again
2260 			 * as we might lose it too soon.
2261 			 */
2262 			sock_hold(sk);
2263 		}
2264 		refcounted = true;
2265 		nsk = NULL;
2266 		if (!tcp_filter(sk, skb)) {
2267 			th = (const struct tcphdr *)skb->data;
2268 			iph = ip_hdr(skb);
2269 			tcp_v4_fill_cb(skb, iph, th);
2270 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2271 		} else {
2272 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2273 		}
2274 		if (!nsk) {
2275 			reqsk_put(req);
2276 			if (req_stolen) {
2277 				/* Another cpu got exclusive access to req
2278 				 * and created a full-blown socket.
2279 				 * Try to feed this packet to this socket
2280 				 * instead of discarding it.
2281 				 */
2282 				tcp_v4_restore_cb(skb);
2283 				sock_put(sk);
2284 				goto lookup;
2285 			}
2286 			goto discard_and_relse;
2287 		}
2288 		nf_reset_ct(skb);
2289 		if (nsk == sk) {
2290 			reqsk_put(req);
2291 			tcp_v4_restore_cb(skb);
2292 		} else {
2293 			drop_reason = tcp_child_process(sk, nsk, skb);
2294 			if (drop_reason) {
2295 				enum sk_rst_reason rst_reason;
2296 
2297 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2298 				tcp_v4_send_reset(nsk, skb, rst_reason);
2299 				goto discard_and_relse;
2300 			}
2301 			sock_put(sk);
2302 			return 0;
2303 		}
2304 	}
2305 
2306 process:
2307 	if (static_branch_unlikely(&ip4_min_ttl)) {
2308 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2309 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2310 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2311 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2312 			goto discard_and_relse;
2313 		}
2314 	}
2315 
2316 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2317 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2318 		goto discard_and_relse;
2319 	}
2320 
2321 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2322 				       AF_INET, dif, sdif);
2323 	if (drop_reason)
2324 		goto discard_and_relse;
2325 
2326 	nf_reset_ct(skb);
2327 
2328 	if (tcp_filter(sk, skb)) {
2329 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2330 		goto discard_and_relse;
2331 	}
2332 	th = (const struct tcphdr *)skb->data;
2333 	iph = ip_hdr(skb);
2334 	tcp_v4_fill_cb(skb, iph, th);
2335 
2336 	skb->dev = NULL;
2337 
2338 	if (sk->sk_state == TCP_LISTEN) {
2339 		ret = tcp_v4_do_rcv(sk, skb);
2340 		goto put_and_return;
2341 	}
2342 
2343 	sk_incoming_cpu_update(sk);
2344 
2345 	bh_lock_sock_nested(sk);
2346 	tcp_segs_in(tcp_sk(sk), skb);
2347 	ret = 0;
2348 	if (!sock_owned_by_user(sk)) {
2349 		ret = tcp_v4_do_rcv(sk, skb);
2350 	} else {
2351 		if (tcp_add_backlog(sk, skb, &drop_reason))
2352 			goto discard_and_relse;
2353 	}
2354 	bh_unlock_sock(sk);
2355 
2356 put_and_return:
2357 	if (refcounted)
2358 		sock_put(sk);
2359 
2360 	return ret;
2361 
2362 no_tcp_socket:
2363 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2364 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2365 		goto discard_it;
2366 
2367 	tcp_v4_fill_cb(skb, iph, th);
2368 
2369 	if (tcp_checksum_complete(skb)) {
2370 csum_error:
2371 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2372 		trace_tcp_bad_csum(skb);
2373 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2374 bad_packet:
2375 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2376 	} else {
2377 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2378 	}
2379 
2380 discard_it:
2381 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2382 	/* Discard frame. */
2383 	sk_skb_reason_drop(sk, skb, drop_reason);
2384 	return 0;
2385 
2386 discard_and_relse:
2387 	sk_drops_add(sk, skb);
2388 	if (refcounted)
2389 		sock_put(sk);
2390 	goto discard_it;
2391 
2392 do_time_wait:
2393 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2394 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2395 		inet_twsk_put(inet_twsk(sk));
2396 		goto discard_it;
2397 	}
2398 
2399 	tcp_v4_fill_cb(skb, iph, th);
2400 
2401 	if (tcp_checksum_complete(skb)) {
2402 		inet_twsk_put(inet_twsk(sk));
2403 		goto csum_error;
2404 	}
2405 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2406 	case TCP_TW_SYN: {
2407 		struct sock *sk2 = inet_lookup_listener(net,
2408 							net->ipv4.tcp_death_row.hashinfo,
2409 							skb, __tcp_hdrlen(th),
2410 							iph->saddr, th->source,
2411 							iph->daddr, th->dest,
2412 							inet_iif(skb),
2413 							sdif);
2414 		if (sk2) {
2415 			inet_twsk_deschedule_put(inet_twsk(sk));
2416 			sk = sk2;
2417 			tcp_v4_restore_cb(skb);
2418 			refcounted = false;
2419 			__this_cpu_write(tcp_tw_isn, isn);
2420 			goto process;
2421 		}
2422 	}
2423 		/* to ACK */
2424 		fallthrough;
2425 	case TCP_TW_ACK:
2426 		tcp_v4_timewait_ack(sk, skb);
2427 		break;
2428 	case TCP_TW_RST:
2429 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2430 		inet_twsk_deschedule_put(inet_twsk(sk));
2431 		goto discard_it;
2432 	case TCP_TW_SUCCESS:;
2433 	}
2434 	goto discard_it;
2435 }
2436 
2437 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2438 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2439 	.twsk_destructor= tcp_twsk_destructor,
2440 };
2441 
2442 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2443 {
2444 	struct dst_entry *dst = skb_dst(skb);
2445 
2446 	if (dst && dst_hold_safe(dst)) {
2447 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2448 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2449 	}
2450 }
2451 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2452 
2453 const struct inet_connection_sock_af_ops ipv4_specific = {
2454 	.queue_xmit	   = ip_queue_xmit,
2455 	.send_check	   = tcp_v4_send_check,
2456 	.rebuild_header	   = inet_sk_rebuild_header,
2457 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2458 	.conn_request	   = tcp_v4_conn_request,
2459 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2460 	.net_header_len	   = sizeof(struct iphdr),
2461 	.setsockopt	   = ip_setsockopt,
2462 	.getsockopt	   = ip_getsockopt,
2463 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2464 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2465 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2466 };
2467 EXPORT_SYMBOL(ipv4_specific);
2468 
2469 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2470 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2471 #ifdef CONFIG_TCP_MD5SIG
2472 	.md5_lookup		= tcp_v4_md5_lookup,
2473 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2474 	.md5_parse		= tcp_v4_parse_md5_keys,
2475 #endif
2476 #ifdef CONFIG_TCP_AO
2477 	.ao_lookup		= tcp_v4_ao_lookup,
2478 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2479 	.ao_parse		= tcp_v4_parse_ao,
2480 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2481 #endif
2482 };
2483 #endif
2484 
2485 /* NOTE: A lot of things are set to zero explicitly by the call to
2486  *       sk_alloc(), so they need not be done here.
2487  */
2488 static int tcp_v4_init_sock(struct sock *sk)
2489 {
2490 	struct inet_connection_sock *icsk = inet_csk(sk);
2491 
2492 	tcp_init_sock(sk);
2493 
2494 	icsk->icsk_af_ops = &ipv4_specific;
2495 
2496 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2497 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2498 #endif
2499 
2500 	return 0;
2501 }
2502 
2503 #ifdef CONFIG_TCP_MD5SIG
2504 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2505 {
2506 	struct tcp_md5sig_info *md5sig;
2507 
2508 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2509 	kfree(md5sig);
2510 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2511 	tcp_md5_release_sigpool();
2512 }
2513 #endif
2514 
2515 void tcp_v4_destroy_sock(struct sock *sk)
2516 {
2517 	struct tcp_sock *tp = tcp_sk(sk);
2518 
2519 	trace_tcp_destroy_sock(sk);
2520 
2521 	tcp_clear_xmit_timers(sk);
2522 
2523 	tcp_cleanup_congestion_control(sk);
2524 
2525 	tcp_cleanup_ulp(sk);
2526 
2527 	/* Clean up the write buffer. */
2528 	tcp_write_queue_purge(sk);
2529 
2530 	/* Check if we want to disable active TFO */
2531 	tcp_fastopen_active_disable_ofo_check(sk);
2532 
2533 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2534 	skb_rbtree_purge(&tp->out_of_order_queue);
2535 
2536 #ifdef CONFIG_TCP_MD5SIG
2537 	/* Clean up the MD5 key list, if any */
2538 	if (tp->md5sig_info) {
2539 		struct tcp_md5sig_info *md5sig;
2540 
2541 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2542 		tcp_clear_md5_list(sk);
2543 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2544 		rcu_assign_pointer(tp->md5sig_info, NULL);
2545 	}
2546 #endif
2547 	tcp_ao_destroy_sock(sk, false);
2548 
2549 	/* Clean up a referenced TCP bind bucket. */
2550 	if (inet_csk(sk)->icsk_bind_hash)
2551 		inet_put_port(sk);
2552 
2553 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2554 
2555 	/* If socket is aborted during connect operation */
2556 	tcp_free_fastopen_req(tp);
2557 	tcp_fastopen_destroy_cipher(sk);
2558 	tcp_saved_syn_free(tp);
2559 
2560 	sk_sockets_allocated_dec(sk);
2561 }
2562 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2563 
2564 #ifdef CONFIG_PROC_FS
2565 /* Proc filesystem TCP sock list dumping. */
2566 
2567 static unsigned short seq_file_family(const struct seq_file *seq);
2568 
2569 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2570 {
2571 	unsigned short family = seq_file_family(seq);
2572 
2573 	/* AF_UNSPEC is used as a match all */
2574 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2575 		net_eq(sock_net(sk), seq_file_net(seq)));
2576 }
2577 
2578 /* Find a non-empty bucket (starting from st->bucket)
2579  * and return the first sk from it.
2580  */
2581 static void *listening_get_first(struct seq_file *seq)
2582 {
2583 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2584 	struct tcp_iter_state *st = seq->private;
2585 
2586 	st->offset = 0;
2587 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2588 		struct inet_listen_hashbucket *ilb2;
2589 		struct hlist_nulls_node *node;
2590 		struct sock *sk;
2591 
2592 		ilb2 = &hinfo->lhash2[st->bucket];
2593 		if (hlist_nulls_empty(&ilb2->nulls_head))
2594 			continue;
2595 
2596 		spin_lock(&ilb2->lock);
2597 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2598 			if (seq_sk_match(seq, sk))
2599 				return sk;
2600 		}
2601 		spin_unlock(&ilb2->lock);
2602 	}
2603 
2604 	return NULL;
2605 }
2606 
2607 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2608  * If "cur" is the last one in the st->bucket,
2609  * call listening_get_first() to return the first sk of the next
2610  * non-empty bucket.
2611  */
2612 static void *listening_get_next(struct seq_file *seq, void *cur)
2613 {
2614 	struct tcp_iter_state *st = seq->private;
2615 	struct inet_listen_hashbucket *ilb2;
2616 	struct hlist_nulls_node *node;
2617 	struct inet_hashinfo *hinfo;
2618 	struct sock *sk = cur;
2619 
2620 	++st->num;
2621 	++st->offset;
2622 
2623 	sk = sk_nulls_next(sk);
2624 	sk_nulls_for_each_from(sk, node) {
2625 		if (seq_sk_match(seq, sk))
2626 			return sk;
2627 	}
2628 
2629 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2630 	ilb2 = &hinfo->lhash2[st->bucket];
2631 	spin_unlock(&ilb2->lock);
2632 	++st->bucket;
2633 	return listening_get_first(seq);
2634 }
2635 
2636 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2637 {
2638 	struct tcp_iter_state *st = seq->private;
2639 	void *rc;
2640 
2641 	st->bucket = 0;
2642 	st->offset = 0;
2643 	rc = listening_get_first(seq);
2644 
2645 	while (rc && *pos) {
2646 		rc = listening_get_next(seq, rc);
2647 		--*pos;
2648 	}
2649 	return rc;
2650 }
2651 
2652 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2653 				const struct tcp_iter_state *st)
2654 {
2655 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2656 }
2657 
2658 /*
2659  * Get first established socket starting from bucket given in st->bucket.
2660  * If st->bucket is zero, the very first socket in the hash is returned.
2661  */
2662 static void *established_get_first(struct seq_file *seq)
2663 {
2664 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2665 	struct tcp_iter_state *st = seq->private;
2666 
2667 	st->offset = 0;
2668 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2669 		struct sock *sk;
2670 		struct hlist_nulls_node *node;
2671 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2672 
2673 		cond_resched();
2674 
2675 		/* Lockless fast path for the common case of empty buckets */
2676 		if (empty_bucket(hinfo, st))
2677 			continue;
2678 
2679 		spin_lock_bh(lock);
2680 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2681 			if (seq_sk_match(seq, sk))
2682 				return sk;
2683 		}
2684 		spin_unlock_bh(lock);
2685 	}
2686 
2687 	return NULL;
2688 }
2689 
2690 static void *established_get_next(struct seq_file *seq, void *cur)
2691 {
2692 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2693 	struct tcp_iter_state *st = seq->private;
2694 	struct hlist_nulls_node *node;
2695 	struct sock *sk = cur;
2696 
2697 	++st->num;
2698 	++st->offset;
2699 
2700 	sk = sk_nulls_next(sk);
2701 
2702 	sk_nulls_for_each_from(sk, node) {
2703 		if (seq_sk_match(seq, sk))
2704 			return sk;
2705 	}
2706 
2707 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2708 	++st->bucket;
2709 	return established_get_first(seq);
2710 }
2711 
2712 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2713 {
2714 	struct tcp_iter_state *st = seq->private;
2715 	void *rc;
2716 
2717 	st->bucket = 0;
2718 	rc = established_get_first(seq);
2719 
2720 	while (rc && pos) {
2721 		rc = established_get_next(seq, rc);
2722 		--pos;
2723 	}
2724 	return rc;
2725 }
2726 
2727 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2728 {
2729 	void *rc;
2730 	struct tcp_iter_state *st = seq->private;
2731 
2732 	st->state = TCP_SEQ_STATE_LISTENING;
2733 	rc	  = listening_get_idx(seq, &pos);
2734 
2735 	if (!rc) {
2736 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2737 		rc	  = established_get_idx(seq, pos);
2738 	}
2739 
2740 	return rc;
2741 }
2742 
2743 static void *tcp_seek_last_pos(struct seq_file *seq)
2744 {
2745 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2746 	struct tcp_iter_state *st = seq->private;
2747 	int bucket = st->bucket;
2748 	int offset = st->offset;
2749 	int orig_num = st->num;
2750 	void *rc = NULL;
2751 
2752 	switch (st->state) {
2753 	case TCP_SEQ_STATE_LISTENING:
2754 		if (st->bucket > hinfo->lhash2_mask)
2755 			break;
2756 		rc = listening_get_first(seq);
2757 		while (offset-- && rc && bucket == st->bucket)
2758 			rc = listening_get_next(seq, rc);
2759 		if (rc)
2760 			break;
2761 		st->bucket = 0;
2762 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2763 		fallthrough;
2764 	case TCP_SEQ_STATE_ESTABLISHED:
2765 		if (st->bucket > hinfo->ehash_mask)
2766 			break;
2767 		rc = established_get_first(seq);
2768 		while (offset-- && rc && bucket == st->bucket)
2769 			rc = established_get_next(seq, rc);
2770 	}
2771 
2772 	st->num = orig_num;
2773 
2774 	return rc;
2775 }
2776 
2777 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2778 {
2779 	struct tcp_iter_state *st = seq->private;
2780 	void *rc;
2781 
2782 	if (*pos && *pos == st->last_pos) {
2783 		rc = tcp_seek_last_pos(seq);
2784 		if (rc)
2785 			goto out;
2786 	}
2787 
2788 	st->state = TCP_SEQ_STATE_LISTENING;
2789 	st->num = 0;
2790 	st->bucket = 0;
2791 	st->offset = 0;
2792 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2793 
2794 out:
2795 	st->last_pos = *pos;
2796 	return rc;
2797 }
2798 EXPORT_SYMBOL(tcp_seq_start);
2799 
2800 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2801 {
2802 	struct tcp_iter_state *st = seq->private;
2803 	void *rc = NULL;
2804 
2805 	if (v == SEQ_START_TOKEN) {
2806 		rc = tcp_get_idx(seq, 0);
2807 		goto out;
2808 	}
2809 
2810 	switch (st->state) {
2811 	case TCP_SEQ_STATE_LISTENING:
2812 		rc = listening_get_next(seq, v);
2813 		if (!rc) {
2814 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2815 			st->bucket = 0;
2816 			st->offset = 0;
2817 			rc	  = established_get_first(seq);
2818 		}
2819 		break;
2820 	case TCP_SEQ_STATE_ESTABLISHED:
2821 		rc = established_get_next(seq, v);
2822 		break;
2823 	}
2824 out:
2825 	++*pos;
2826 	st->last_pos = *pos;
2827 	return rc;
2828 }
2829 EXPORT_SYMBOL(tcp_seq_next);
2830 
2831 void tcp_seq_stop(struct seq_file *seq, void *v)
2832 {
2833 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2834 	struct tcp_iter_state *st = seq->private;
2835 
2836 	switch (st->state) {
2837 	case TCP_SEQ_STATE_LISTENING:
2838 		if (v != SEQ_START_TOKEN)
2839 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2840 		break;
2841 	case TCP_SEQ_STATE_ESTABLISHED:
2842 		if (v)
2843 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2844 		break;
2845 	}
2846 }
2847 EXPORT_SYMBOL(tcp_seq_stop);
2848 
2849 static void get_openreq4(const struct request_sock *req,
2850 			 struct seq_file *f, int i)
2851 {
2852 	const struct inet_request_sock *ireq = inet_rsk(req);
2853 	long delta = req->rsk_timer.expires - jiffies;
2854 
2855 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2856 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2857 		i,
2858 		ireq->ir_loc_addr,
2859 		ireq->ir_num,
2860 		ireq->ir_rmt_addr,
2861 		ntohs(ireq->ir_rmt_port),
2862 		TCP_SYN_RECV,
2863 		0, 0, /* could print option size, but that is af dependent. */
2864 		1,    /* timers active (only the expire timer) */
2865 		jiffies_delta_to_clock_t(delta),
2866 		req->num_timeout,
2867 		from_kuid_munged(seq_user_ns(f),
2868 				 sock_i_uid(req->rsk_listener)),
2869 		0,  /* non standard timer */
2870 		0, /* open_requests have no inode */
2871 		0,
2872 		req);
2873 }
2874 
2875 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2876 {
2877 	int timer_active;
2878 	unsigned long timer_expires;
2879 	const struct tcp_sock *tp = tcp_sk(sk);
2880 	const struct inet_connection_sock *icsk = inet_csk(sk);
2881 	const struct inet_sock *inet = inet_sk(sk);
2882 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2883 	__be32 dest = inet->inet_daddr;
2884 	__be32 src = inet->inet_rcv_saddr;
2885 	__u16 destp = ntohs(inet->inet_dport);
2886 	__u16 srcp = ntohs(inet->inet_sport);
2887 	int rx_queue;
2888 	int state;
2889 
2890 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2891 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2892 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2893 		timer_active	= 1;
2894 		timer_expires	= icsk->icsk_timeout;
2895 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2896 		timer_active	= 4;
2897 		timer_expires	= icsk->icsk_timeout;
2898 	} else if (timer_pending(&sk->sk_timer)) {
2899 		timer_active	= 2;
2900 		timer_expires	= sk->sk_timer.expires;
2901 	} else {
2902 		timer_active	= 0;
2903 		timer_expires = jiffies;
2904 	}
2905 
2906 	state = inet_sk_state_load(sk);
2907 	if (state == TCP_LISTEN)
2908 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2909 	else
2910 		/* Because we don't lock the socket,
2911 		 * we might find a transient negative value.
2912 		 */
2913 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2914 				      READ_ONCE(tp->copied_seq), 0);
2915 
2916 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2917 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2918 		i, src, srcp, dest, destp, state,
2919 		READ_ONCE(tp->write_seq) - tp->snd_una,
2920 		rx_queue,
2921 		timer_active,
2922 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2923 		icsk->icsk_retransmits,
2924 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2925 		icsk->icsk_probes_out,
2926 		sock_i_ino(sk),
2927 		refcount_read(&sk->sk_refcnt), sk,
2928 		jiffies_to_clock_t(icsk->icsk_rto),
2929 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2930 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2931 		tcp_snd_cwnd(tp),
2932 		state == TCP_LISTEN ?
2933 		    fastopenq->max_qlen :
2934 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2935 }
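/*
 * Editor's note (illustrative, not part of the original source): how a
 * /proc/net/tcp line produced above reads on a little-endian host.
 * Addresses are raw __be32 values printed with %08X, so 127.0.0.1 appears as
 * 0100007F; ports and the state are hex, so a listener on port 22 in
 * TCP_LISTEN (10) starts like:
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 ...
 */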
2936 
2937 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2938 			       struct seq_file *f, int i)
2939 {
2940 	long delta = tw->tw_timer.expires - jiffies;
2941 	__be32 dest, src;
2942 	__u16 destp, srcp;
2943 
2944 	dest  = tw->tw_daddr;
2945 	src   = tw->tw_rcv_saddr;
2946 	destp = ntohs(tw->tw_dport);
2947 	srcp  = ntohs(tw->tw_sport);
2948 
2949 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2950 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2951 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2952 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2953 		refcount_read(&tw->tw_refcnt), tw);
2954 }
2955 
2956 #define TMPSZ 150
2957 
2958 static int tcp4_seq_show(struct seq_file *seq, void *v)
2959 {
2960 	struct tcp_iter_state *st;
2961 	struct sock *sk = v;
2962 
2963 	seq_setwidth(seq, TMPSZ - 1);
2964 	if (v == SEQ_START_TOKEN) {
2965 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2966 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2967 			   "inode");
2968 		goto out;
2969 	}
2970 	st = seq->private;
2971 
2972 	if (sk->sk_state == TCP_TIME_WAIT)
2973 		get_timewait4_sock(v, seq, st->num);
2974 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2975 		get_openreq4(v, seq, st->num);
2976 	else
2977 		get_tcp4_sock(v, seq, st->num);
2978 out:
2979 	seq_pad(seq, '\n');
2980 	return 0;
2981 }
2982 
2983 #ifdef CONFIG_BPF_SYSCALL
2984 struct bpf_tcp_iter_state {
2985 	struct tcp_iter_state state;
2986 	unsigned int cur_sk;
2987 	unsigned int end_sk;
2988 	unsigned int max_sk;
2989 	struct sock **batch;
2990 	bool st_bucket_done;
2991 };
2992 
2993 struct bpf_iter__tcp {
2994 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2995 	__bpf_md_ptr(struct sock_common *, sk_common);
2996 	uid_t uid __aligned(8);
2997 };
2998 
2999 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3000 			     struct sock_common *sk_common, uid_t uid)
3001 {
3002 	struct bpf_iter__tcp ctx;
3003 
3004 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3005 	ctx.meta = meta;
3006 	ctx.sk_common = sk_common;
3007 	ctx.uid = uid;
3008 	return bpf_iter_run_prog(prog, &ctx);
3009 }
3010 
3011 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3012 {
3013 	while (iter->cur_sk < iter->end_sk)
3014 		sock_gen_put(iter->batch[iter->cur_sk++]);
3015 }
3016 
3017 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3018 				      unsigned int new_batch_sz)
3019 {
3020 	struct sock **new_batch;
3021 
3022 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3023 			     GFP_USER | __GFP_NOWARN);
3024 	if (!new_batch)
3025 		return -ENOMEM;
3026 
3027 	bpf_iter_tcp_put_batch(iter);
3028 	kvfree(iter->batch);
3029 	iter->batch = new_batch;
3030 	iter->max_sk = new_batch_sz;
3031 
3032 	return 0;
3033 }
3034 
3035 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3036 						 struct sock *start_sk)
3037 {
3038 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3039 	struct bpf_tcp_iter_state *iter = seq->private;
3040 	struct tcp_iter_state *st = &iter->state;
3041 	struct hlist_nulls_node *node;
3042 	unsigned int expected = 1;
3043 	struct sock *sk;
3044 
3045 	sock_hold(start_sk);
3046 	iter->batch[iter->end_sk++] = start_sk;
3047 
3048 	sk = sk_nulls_next(start_sk);
3049 	sk_nulls_for_each_from(sk, node) {
3050 		if (seq_sk_match(seq, sk)) {
3051 			if (iter->end_sk < iter->max_sk) {
3052 				sock_hold(sk);
3053 				iter->batch[iter->end_sk++] = sk;
3054 			}
3055 			expected++;
3056 		}
3057 	}
3058 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3059 
3060 	return expected;
3061 }
3062 
3063 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3064 						   struct sock *start_sk)
3065 {
3066 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3067 	struct bpf_tcp_iter_state *iter = seq->private;
3068 	struct tcp_iter_state *st = &iter->state;
3069 	struct hlist_nulls_node *node;
3070 	unsigned int expected = 1;
3071 	struct sock *sk;
3072 
3073 	sock_hold(start_sk);
3074 	iter->batch[iter->end_sk++] = start_sk;
3075 
3076 	sk = sk_nulls_next(start_sk);
3077 	sk_nulls_for_each_from(sk, node) {
3078 		if (seq_sk_match(seq, sk)) {
3079 			if (iter->end_sk < iter->max_sk) {
3080 				sock_hold(sk);
3081 				iter->batch[iter->end_sk++] = sk;
3082 			}
3083 			expected++;
3084 		}
3085 	}
3086 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3087 
3088 	return expected;
3089 }
3090 
3091 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3092 {
3093 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3094 	struct bpf_tcp_iter_state *iter = seq->private;
3095 	struct tcp_iter_state *st = &iter->state;
3096 	unsigned int expected;
3097 	bool resized = false;
3098 	struct sock *sk;
3099 
3100 	/* The st->bucket is done.  Directly advance to the next
3101 	 * bucket instead of having tcp_seek_last_pos() skip entries
3102 	 * one by one in the current bucket only to find out
3103 	 * it has to advance to the next bucket.
3104 	 */
3105 	if (iter->st_bucket_done) {
3106 		st->offset = 0;
3107 		st->bucket++;
3108 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3109 		    st->bucket > hinfo->lhash2_mask) {
3110 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3111 			st->bucket = 0;
3112 		}
3113 	}
3114 
3115 again:
3116 	/* Get a new batch */
3117 	iter->cur_sk = 0;
3118 	iter->end_sk = 0;
3119 	iter->st_bucket_done = false;
3120 
3121 	sk = tcp_seek_last_pos(seq);
3122 	if (!sk)
3123 		return NULL; /* Done */
3124 
3125 	if (st->state == TCP_SEQ_STATE_LISTENING)
3126 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3127 	else
3128 		expected = bpf_iter_tcp_established_batch(seq, sk);
3129 
3130 	if (iter->end_sk == expected) {
3131 		iter->st_bucket_done = true;
3132 		return sk;
3133 	}
3134 
3135 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3136 		resized = true;
3137 		goto again;
3138 	}
3139 
3140 	return sk;
3141 }
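/*
 * Editor's note (summary, not part of the original source): if the current
 * bucket holds more sockets than iter->max_sk, the walk above counts them
 * ("expected"), grows the batch to expected * 3 / 2 and re-walks the bucket
 * once (the "resized" flag).  Should the bucket grow again in between, the
 * partial batch is returned with st_bucket_done left false, so the remainder
 * is picked up by the next bpf_iter_tcp_batch() call via st->offset.
 */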
3142 
3143 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3144 {
3145 	/* bpf iter does not support lseek, so it always
3146 	 * continues from where it was stop()-ped.
3147 	 */
3148 	if (*pos)
3149 		return bpf_iter_tcp_batch(seq);
3150 
3151 	return SEQ_START_TOKEN;
3152 }
3153 
3154 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3155 {
3156 	struct bpf_tcp_iter_state *iter = seq->private;
3157 	struct tcp_iter_state *st = &iter->state;
3158 	struct sock *sk;
3159 
3160 	/* Whenever seq_next() is called, the iter->cur_sk is
3161 	 * done with seq_show(), so advance to the next sk in
3162 	 * the batch.
3163 	 */
3164 	if (iter->cur_sk < iter->end_sk) {
3165 		/* Keeping st->num consistent in tcp_iter_state.
3166 		 * bpf_iter_tcp does not use st->num.
3167 		 * meta.seq_num is used instead.
3168 		 */
3169 		st->num++;
3170 		/* Move st->offset to the next sk in the bucket such that
3171 		 * the future start() will resume at st->offset in
3172 		 * st->bucket.  See tcp_seek_last_pos().
3173 		 */
3174 		st->offset++;
3175 		sock_gen_put(iter->batch[iter->cur_sk++]);
3176 	}
3177 
3178 	if (iter->cur_sk < iter->end_sk)
3179 		sk = iter->batch[iter->cur_sk];
3180 	else
3181 		sk = bpf_iter_tcp_batch(seq);
3182 
3183 	++*pos;
3184 	/* Keeping st->last_pos consistent in tcp_iter_state.
3185 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3186 	 */
3187 	st->last_pos = *pos;
3188 	return sk;
3189 }
3190 
3191 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3192 {
3193 	struct bpf_iter_meta meta;
3194 	struct bpf_prog *prog;
3195 	struct sock *sk = v;
3196 	uid_t uid;
3197 	int ret;
3198 
3199 	if (v == SEQ_START_TOKEN)
3200 		return 0;
3201 
3202 	if (sk_fullsock(sk))
3203 		lock_sock(sk);
3204 
3205 	if (unlikely(sk_unhashed(sk))) {
3206 		ret = SEQ_SKIP;
3207 		goto unlock;
3208 	}
3209 
3210 	if (sk->sk_state == TCP_TIME_WAIT) {
3211 		uid = 0;
3212 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3213 		const struct request_sock *req = v;
3214 
3215 		uid = from_kuid_munged(seq_user_ns(seq),
3216 				       sock_i_uid(req->rsk_listener));
3217 	} else {
3218 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3219 	}
3220 
3221 	meta.seq = seq;
3222 	prog = bpf_iter_get_info(&meta, false);
3223 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3224 
3225 unlock:
3226 	if (sk_fullsock(sk))
3227 		release_sock(sk);
3228 	return ret;
3229 
3230 }
3231 
3232 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3233 {
3234 	struct bpf_tcp_iter_state *iter = seq->private;
3235 	struct bpf_iter_meta meta;
3236 	struct bpf_prog *prog;
3237 
3238 	if (!v) {
3239 		meta.seq = seq;
3240 		prog = bpf_iter_get_info(&meta, true);
3241 		if (prog)
3242 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3243 	}
3244 
3245 	if (iter->cur_sk < iter->end_sk) {
3246 		bpf_iter_tcp_put_batch(iter);
3247 		iter->st_bucket_done = false;
3248 	}
3249 }
3250 
3251 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3252 	.show		= bpf_iter_tcp_seq_show,
3253 	.start		= bpf_iter_tcp_seq_start,
3254 	.next		= bpf_iter_tcp_seq_next,
3255 	.stop		= bpf_iter_tcp_seq_stop,
3256 };
3257 #endif
3258 static unsigned short seq_file_family(const struct seq_file *seq)
3259 {
3260 	const struct tcp_seq_afinfo *afinfo;
3261 
3262 #ifdef CONFIG_BPF_SYSCALL
3263 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3264 	if (seq->op == &bpf_iter_tcp_seq_ops)
3265 		return AF_UNSPEC;
3266 #endif
3267 
3268 	/* Iterated from proc fs */
3269 	afinfo = pde_data(file_inode(seq->file));
3270 	return afinfo->family;
3271 }
3272 
3273 static const struct seq_operations tcp4_seq_ops = {
3274 	.show		= tcp4_seq_show,
3275 	.start		= tcp_seq_start,
3276 	.next		= tcp_seq_next,
3277 	.stop		= tcp_seq_stop,
3278 };
3279 
3280 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3281 	.family		= AF_INET,
3282 };
3283 
3284 static int __net_init tcp4_proc_init_net(struct net *net)
3285 {
3286 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3287 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3288 		return -ENOMEM;
3289 	return 0;
3290 }
3291 
3292 static void __net_exit tcp4_proc_exit_net(struct net *net)
3293 {
3294 	remove_proc_entry("tcp", net->proc_net);
3295 }
3296 
3297 static struct pernet_operations tcp4_net_ops = {
3298 	.init = tcp4_proc_init_net,
3299 	.exit = tcp4_proc_exit_net,
3300 };
3301 
3302 int __init tcp4_proc_init(void)
3303 {
3304 	return register_pernet_subsys(&tcp4_net_ops);
3305 }
3306 
3307 void tcp4_proc_exit(void)
3308 {
3309 	unregister_pernet_subsys(&tcp4_net_ops);
3310 }
3311 #endif /* CONFIG_PROC_FS */
3312 
3313 /* @wake is one when sk_stream_write_space() calls us.
3314  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3315  * This mimics the strategy used in sock_def_write_space().
3316  */
3317 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3318 {
3319 	const struct tcp_sock *tp = tcp_sk(sk);
3320 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3321 			    READ_ONCE(tp->snd_nxt);
3322 
3323 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3324 }
3325 EXPORT_SYMBOL(tcp_stream_memory_free);
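/*
 * Editor's note (illustrative, not part of the original source): with
 * TCP_NOTSENT_LOWAT set to 131072, a plain poll() (wake == 0) reports the
 * stream writable while notsent_bytes < 131072, whereas the wakeup path
 * (wake == 1) shifts left by one, so EPOLLOUT is signalled only once
 * notsent_bytes has drained below 65536, i.e. half the limit.  With the
 * per-netns default of UINT_MAX the check effectively always passes.
 */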
3326 
3327 struct proto tcp_prot = {
3328 	.name			= "TCP",
3329 	.owner			= THIS_MODULE,
3330 	.close			= tcp_close,
3331 	.pre_connect		= tcp_v4_pre_connect,
3332 	.connect		= tcp_v4_connect,
3333 	.disconnect		= tcp_disconnect,
3334 	.accept			= inet_csk_accept,
3335 	.ioctl			= tcp_ioctl,
3336 	.init			= tcp_v4_init_sock,
3337 	.destroy		= tcp_v4_destroy_sock,
3338 	.shutdown		= tcp_shutdown,
3339 	.setsockopt		= tcp_setsockopt,
3340 	.getsockopt		= tcp_getsockopt,
3341 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3342 	.keepalive		= tcp_set_keepalive,
3343 	.recvmsg		= tcp_recvmsg,
3344 	.sendmsg		= tcp_sendmsg,
3345 	.splice_eof		= tcp_splice_eof,
3346 	.backlog_rcv		= tcp_v4_do_rcv,
3347 	.release_cb		= tcp_release_cb,
3348 	.hash			= inet_hash,
3349 	.unhash			= inet_unhash,
3350 	.get_port		= inet_csk_get_port,
3351 	.put_port		= inet_put_port,
3352 #ifdef CONFIG_BPF_SYSCALL
3353 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3354 #endif
3355 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3356 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3357 	.stream_memory_free	= tcp_stream_memory_free,
3358 	.sockets_allocated	= &tcp_sockets_allocated,
3359 	.orphan_count		= &tcp_orphan_count,
3360 
3361 	.memory_allocated	= &tcp_memory_allocated,
3362 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3363 
3364 	.memory_pressure	= &tcp_memory_pressure,
3365 	.sysctl_mem		= sysctl_tcp_mem,
3366 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3367 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3368 	.max_header		= MAX_TCP_HEADER,
3369 	.obj_size		= sizeof(struct tcp_sock),
3370 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3371 	.twsk_prot		= &tcp_timewait_sock_ops,
3372 	.rsk_prot		= &tcp_request_sock_ops,
3373 	.h.hashinfo		= NULL,
3374 	.no_autobind		= true,
3375 	.diag_destroy		= tcp_abort,
3376 };
3377 EXPORT_SYMBOL(tcp_prot);
3378 
3379 static void __net_exit tcp_sk_exit(struct net *net)
3380 {
3381 	if (net->ipv4.tcp_congestion_control)
3382 		bpf_module_put(net->ipv4.tcp_congestion_control,
3383 			       net->ipv4.tcp_congestion_control->owner);
3384 }
3385 
3386 static void __net_init tcp_set_hashinfo(struct net *net)
3387 {
3388 	struct inet_hashinfo *hinfo;
3389 	unsigned int ehash_entries;
3390 	struct net *old_net;
3391 
3392 	if (net_eq(net, &init_net))
3393 		goto fallback;
3394 
3395 	old_net = current->nsproxy->net_ns;
3396 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3397 	if (!ehash_entries)
3398 		goto fallback;
3399 
3400 	ehash_entries = roundup_pow_of_two(ehash_entries);
3401 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3402 	if (!hinfo) {
3403 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3404 			"for a netns, fallback to the global one\n",
3405 			ehash_entries);
3406 fallback:
3407 		hinfo = &tcp_hashinfo;
3408 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3409 	}
3410 
3411 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3412 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3413 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3414 }
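/*
 * Editor's note (illustrative, not part of the original source): a child
 * netns created while the parent has sysctl_tcp_child_ehash_entries = 1000
 * gets a private ehash rounded up to 1024 entries, hence
 * sysctl_max_tw_buckets = 512 and sysctl_max_syn_backlog = max(128, 8) = 128.
 * With the default of 0 the netns keeps sharing the global tcp_hashinfo.
 */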
3415 
3416 static int __net_init tcp_sk_init(struct net *net)
3417 {
3418 	net->ipv4.sysctl_tcp_ecn = 2;
3419 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3420 
3421 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3422 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3423 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3424 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3425 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3426 
3427 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3428 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3429 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3430 
3431 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3432 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3433 	net->ipv4.sysctl_tcp_syncookies = 1;
3434 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3435 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3436 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3437 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3438 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3439 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3440 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3441 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3442 
3443 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3444 	tcp_set_hashinfo(net);
3445 
3446 	net->ipv4.sysctl_tcp_sack = 1;
3447 	net->ipv4.sysctl_tcp_window_scaling = 1;
3448 	net->ipv4.sysctl_tcp_timestamps = 1;
3449 	net->ipv4.sysctl_tcp_early_retrans = 3;
3450 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3451 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3452 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3453 	net->ipv4.sysctl_tcp_max_reordering = 300;
3454 	net->ipv4.sysctl_tcp_dsack = 1;
3455 	net->ipv4.sysctl_tcp_app_win = 31;
3456 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3457 	net->ipv4.sysctl_tcp_frto = 2;
3458 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3459 	/* This limits the percentage of the congestion window which we
3460 	 * will allow a single TSO frame to consume.  Building TSO frames
3461 	 * which are too large can cause TCP streams to be bursty.
3462 	 */
3463 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3464 	/* Default TSQ limit of 16 TSO segments */
3465 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3466 
3467 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3468 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3469 
3470 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3471 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3472 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3473 	net->ipv4.sysctl_tcp_autocorking = 1;
3474 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3475 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3476 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3477 	if (net != &init_net) {
3478 		memcpy(net->ipv4.sysctl_tcp_rmem,
3479 		       init_net.ipv4.sysctl_tcp_rmem,
3480 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3481 		memcpy(net->ipv4.sysctl_tcp_wmem,
3482 		       init_net.ipv4.sysctl_tcp_wmem,
3483 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3484 	}
3485 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3486 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3487 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3488 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3489 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3490 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3491 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3492 
3493 	/* Set default values for PLB */
3494 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3495 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3496 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3497 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3498 	/* Default congestion threshold for PLB to mark a round is 50% */
3499 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3500 
3501 	/* Reno is always built in */
3502 	if (!net_eq(net, &init_net) &&
3503 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3504 			       init_net.ipv4.tcp_congestion_control->owner))
3505 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3506 	else
3507 		net->ipv4.tcp_congestion_control = &tcp_reno;
3508 
3509 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3510 	net->ipv4.sysctl_tcp_shrink_window = 0;
3511 
3512 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3513 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3514 
3515 	return 0;
3516 }
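/*
 * Editor's note (illustrative, not part of the original source): a few of
 * the defaults above in concrete units: tcp_limit_output_bytes =
 * 16 * 65536 = 1048576 bytes (~1 MiB of queued TSO data per socket),
 * comp_sack_delay_ns = 1 ms, comp_sack_slack_ns = 100 us,
 * tcp_invalid_ratelimit = HZ/2 jiffies = 500 ms, and tcp_rto_min_us =
 * jiffies_to_usecs(TCP_RTO_MIN) = 200000 us, TCP_RTO_MIN being HZ/5 (200 ms)
 * in the mainline headers.
 */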
3517 
3518 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3519 {
3520 	struct net *net;
3521 
3522 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3523 	 * and from the failed setup_net() error unwinding path are serialized.
3524 	 *
3525 	 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those
3526 	 * in net_exit_list, the thread that dismantles a particular twsk must do
3527 	 * so without another thread progressing to refcount_dec_and_test() of
3528 	 * tcp_death_row.tw_refcount.
3529 	 */
3530 	mutex_lock(&tcp_exit_batch_mutex);
3531 
3532 	tcp_twsk_purge(net_exit_list);
3533 
3534 	list_for_each_entry(net, net_exit_list, exit_list) {
3535 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3536 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3537 		tcp_fastopen_ctx_destroy(net);
3538 	}
3539 
3540 	mutex_unlock(&tcp_exit_batch_mutex);
3541 }
3542 
3543 static struct pernet_operations __net_initdata tcp_sk_ops = {
3544 	.init	   = tcp_sk_init,
3545 	.exit	   = tcp_sk_exit,
3546 	.exit_batch = tcp_sk_exit_batch,
3547 };
3548 
3549 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3550 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3551 		     struct sock_common *sk_common, uid_t uid)
3552 
3553 #define INIT_BATCH_SZ 16
3554 
3555 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3556 {
3557 	struct bpf_tcp_iter_state *iter = priv_data;
3558 	int err;
3559 
3560 	err = bpf_iter_init_seq_net(priv_data, aux);
3561 	if (err)
3562 		return err;
3563 
3564 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3565 	if (err) {
3566 		bpf_iter_fini_seq_net(priv_data);
3567 		return err;
3568 	}
3569 
3570 	return 0;
3571 }
3572 
3573 static void bpf_iter_fini_tcp(void *priv_data)
3574 {
3575 	struct bpf_tcp_iter_state *iter = priv_data;
3576 
3577 	bpf_iter_fini_seq_net(priv_data);
3578 	kvfree(iter->batch);
3579 }
3580 
3581 static const struct bpf_iter_seq_info tcp_seq_info = {
3582 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3583 	.init_seq_private	= bpf_iter_init_tcp,
3584 	.fini_seq_private	= bpf_iter_fini_tcp,
3585 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3586 };
3587 
3588 static const struct bpf_func_proto *
3589 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3590 			    const struct bpf_prog *prog)
3591 {
3592 	switch (func_id) {
3593 	case BPF_FUNC_setsockopt:
3594 		return &bpf_sk_setsockopt_proto;
3595 	case BPF_FUNC_getsockopt:
3596 		return &bpf_sk_getsockopt_proto;
3597 	default:
3598 		return NULL;
3599 	}
3600 }
3601 
3602 static struct bpf_iter_reg tcp_reg_info = {
3603 	.target			= "tcp",
3604 	.ctx_arg_info_size	= 1,
3605 	.ctx_arg_info		= {
3606 		{ offsetof(struct bpf_iter__tcp, sk_common),
3607 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3608 	},
3609 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3610 	.seq_info		= &tcp_seq_info,
3611 };
3612 
3613 static void __init bpf_iter_register(void)
3614 {
3615 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3616 	if (bpf_iter_reg_target(&tcp_reg_info))
3617 		pr_warn("Warning: could not register bpf iterator tcp\n");
3618 }
3619 
3620 #endif
3621 
3622 void __init tcp_v4_init(void)
3623 {
3624 	int cpu, res;
3625 
3626 	for_each_possible_cpu(cpu) {
3627 		struct sock *sk;
3628 
3629 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3630 					   IPPROTO_TCP, &init_net);
3631 		if (res)
3632 			panic("Failed to create the TCP control socket.\n");
3633 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3634 
3635 		/* Please enforce IP_DF and IPID==0 for RST and
3636 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3637 		 */
3638 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3639 
3640 		sk->sk_clockid = CLOCK_MONOTONIC;
3641 
3642 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3643 	}
3644 	if (register_pernet_subsys(&tcp_sk_ops))
3645 		panic("Failed to register the TCP pernet subsystem.\n");
3646 
3647 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3648 	bpf_iter_register();
3649 #endif
3650 }
3651