xref: /linux/net/ipv4/tcp_ipv4.c (revision ca220141fa8ebae09765a242076b2b77338106b0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 
92 #include <trace/events/tcp.h>
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
99 struct inet_hashinfo tcp_hashinfo;
100 
101 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
102 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
103 };
104 
105 static DEFINE_MUTEX(tcp_exit_batch_mutex);
106 
107 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108 {
109 	return secure_tcp_seq(ip_hdr(skb)->daddr,
110 			      ip_hdr(skb)->saddr,
111 			      tcp_hdr(skb)->dest,
112 			      tcp_hdr(skb)->source);
113 }
114 
115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116 {
117 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
118 }
119 
120 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
121 {
122 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
123 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
124 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125 	struct tcp_sock *tp = tcp_sk(sk);
126 	int ts_recent_stamp;
127 	u32 reuse_thresh;
128 
129 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
130 		reuse = 0;
131 
132 	if (reuse == 2) {
133 		/* Still does not detect *everything* that goes through
134 		 * lo, since we require a loopback src or dst address
135 		 * or direct binding to 'lo' interface.
136 		 */
137 		bool loopback = false;
138 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
139 			loopback = true;
140 #if IS_ENABLED(CONFIG_IPV6)
141 		if (tw->tw_family == AF_INET6) {
142 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
144 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
145 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
146 				loopback = true;
147 		} else
148 #endif
149 		{
150 			if (ipv4_is_loopback(tw->tw_daddr) ||
151 			    ipv4_is_loopback(tw->tw_rcv_saddr))
152 				loopback = true;
153 		}
154 		if (!loopback)
155 			reuse = 0;
156 	}
157 
158 	/* With PAWS, it is safe from the viewpoint
159 	   of data integrity. Even without PAWS it is safe provided sequence
160 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
161 
162 	   Actually, the idea is close to VJ's one, only timestamp cache is
163 	   held not per host, but per port pair and TW bucket is used as state
164 	   holder.
165 
166 	   If TW bucket has been already destroyed we fall back to VJ's scheme
167 	   and use initial timestamp retrieved from peer table.
168 	 */
169 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
170 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
171 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
172 	if (ts_recent_stamp &&
173 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
174 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
175 		 * and releasing the bucket lock.
176 		 */
177 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
178 			return 0;
179 
180 		/* In case of repair and re-using TIME-WAIT sockets we still
181 		 * want to be sure that it is safe as above but honor the
182 		 * sequence numbers and time stamps set as part of the repair
183 		 * process.
184 		 *
185 		 * Without this check re-using a TIME-WAIT socket with TCP
186 		 * repair would accumulate a -1 on the repair assigned
187 		 * sequence number. The first time it is reused the sequence
188 		 * is -1, the second time -2, etc. This fixes that issue
189 		 * without appearing to create any others.
190 		 */
191 		if (likely(!tp->repair)) {
192 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
193 
194 			if (!seq)
195 				seq = 1;
196 			WRITE_ONCE(tp->write_seq, seq);
197 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
198 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
199 		}
200 
201 		return 1;
202 	}
203 
204 	return 0;
205 }
206 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
207 
208 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
209 			      int addr_len)
210 {
211 	/* This check is replicated from tcp_v4_connect() and intended to
212 	 * prevent BPF program called below from accessing bytes that are out
213 	 * of the bound specified by user in addr_len.
214 	 */
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	sock_owned_by_me(sk);
219 
220 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
221 }
222 
223 /* This will initiate an outgoing connection. */
224 int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
225 {
226 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
227 	struct inet_timewait_death_row *tcp_death_row;
228 	struct inet_sock *inet = inet_sk(sk);
229 	struct tcp_sock *tp = tcp_sk(sk);
230 	struct ip_options_rcu *inet_opt;
231 	struct net *net = sock_net(sk);
232 	__be16 orig_sport, orig_dport;
233 	__be32 daddr, nexthop;
234 	struct flowi4 *fl4;
235 	struct rtable *rt;
236 	int err;
237 
238 	if (addr_len < sizeof(struct sockaddr_in))
239 		return -EINVAL;
240 
241 	if (usin->sin_family != AF_INET)
242 		return -EAFNOSUPPORT;
243 
244 	nexthop = daddr = usin->sin_addr.s_addr;
245 	inet_opt = rcu_dereference_protected(inet->inet_opt,
246 					     lockdep_sock_is_held(sk));
247 	if (inet_opt && inet_opt->opt.srr) {
248 		if (!daddr)
249 			return -EINVAL;
250 		nexthop = inet_opt->opt.faddr;
251 	}
252 
253 	orig_sport = inet->inet_sport;
254 	orig_dport = usin->sin_port;
255 	fl4 = &inet->cork.fl.u.ip4;
256 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
257 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
258 			      orig_dport, sk);
259 	if (IS_ERR(rt)) {
260 		err = PTR_ERR(rt);
261 		if (err == -ENETUNREACH)
262 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
263 		return err;
264 	}
265 
266 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
267 		ip_rt_put(rt);
268 		return -ENETUNREACH;
269 	}
270 
271 	if (!inet_opt || !inet_opt->opt.srr)
272 		daddr = fl4->daddr;
273 
274 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
275 
276 	if (!inet->inet_saddr) {
277 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
278 		if (err) {
279 			ip_rt_put(rt);
280 			return err;
281 		}
282 	} else {
283 		sk_rcv_saddr_set(sk, inet->inet_saddr);
284 	}
285 
286 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
287 		/* Reset inherited state */
288 		tp->rx_opt.ts_recent	   = 0;
289 		tp->rx_opt.ts_recent_stamp = 0;
290 		if (likely(!tp->repair))
291 			WRITE_ONCE(tp->write_seq, 0);
292 	}
293 
294 	inet->inet_dport = usin->sin_port;
295 	sk_daddr_set(sk, daddr);
296 
297 	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
298 	if (inet_opt)
299 		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
300 
301 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
302 
303 	/* Socket identity is still unknown (sport may be zero).
304 	 * However we set state to SYN-SENT and not releasing socket
305 	 * lock select source port, enter ourselves into the hash tables and
306 	 * complete initialization after this.
307 	 */
308 	tcp_set_state(sk, TCP_SYN_SENT);
309 	err = inet_hash_connect(tcp_death_row, sk);
310 	if (err)
311 		goto failure;
312 
313 	sk_set_txhash(sk);
314 
315 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
316 			       inet->inet_sport, inet->inet_dport, sk);
317 	if (IS_ERR(rt)) {
318 		err = PTR_ERR(rt);
319 		rt = NULL;
320 		goto failure;
321 	}
322 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
323 	/* OK, now commit destination to socket.  */
324 	sk->sk_gso_type = SKB_GSO_TCPV4;
325 	sk_setup_caps(sk, &rt->dst);
326 	rt = NULL;
327 
328 	if (likely(!tp->repair)) {
329 		if (!tp->write_seq)
330 			WRITE_ONCE(tp->write_seq,
331 				   secure_tcp_seq(inet->inet_saddr,
332 						  inet->inet_daddr,
333 						  inet->inet_sport,
334 						  usin->sin_port));
335 		WRITE_ONCE(tp->tsoffset,
336 			   secure_tcp_ts_off(net, inet->inet_saddr,
337 					     inet->inet_daddr));
338 	}
339 
340 	atomic_set(&inet->inet_id, get_random_u16());
341 
342 	if (tcp_fastopen_defer_connect(sk, &err))
343 		return err;
344 	if (err)
345 		goto failure;
346 
347 	err = tcp_connect(sk);
348 
349 	if (err)
350 		goto failure;
351 
352 	return 0;
353 
354 failure:
355 	/*
356 	 * This unhashes the socket and releases the local port,
357 	 * if necessary.
358 	 */
359 	tcp_set_state(sk, TCP_CLOSE);
360 	inet_bhash2_reset_saddr(sk);
361 	ip_rt_put(rt);
362 	sk->sk_route_caps = 0;
363 	inet->inet_dport = 0;
364 	return err;
365 }
366 EXPORT_IPV6_MOD(tcp_v4_connect);
367 
368 /*
369  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
370  * It can be called through tcp_release_cb() if socket was owned by user
371  * at the time tcp_v4_err() was called to handle ICMP message.
372  */
373 void tcp_v4_mtu_reduced(struct sock *sk)
374 {
375 	struct inet_sock *inet = inet_sk(sk);
376 	struct dst_entry *dst;
377 	u32 mtu, dmtu;
378 
379 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
380 		return;
381 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
382 	dst = inet_csk_update_pmtu(sk, mtu);
383 	if (!dst)
384 		return;
385 
386 	/* Something is about to be wrong... Remember soft error
387 	 * for the case, if this connection will not able to recover.
388 	 */
389 	dmtu = dst4_mtu(dst);
390 	if (mtu < dmtu && ip_dont_fragment(sk, dst))
391 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
392 
393 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
394 	    ip_sk_accept_pmtu(sk) &&
395 	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
396 		tcp_sync_mss(sk, dmtu);
397 
398 		/* Resend the TCP packet because it's
399 		 * clear that the old packet has been
400 		 * dropped. This is the new "fast" path mtu
401 		 * discovery.
402 		 */
403 		tcp_simple_retransmit(sk);
404 	} /* else let the usual retransmit timer handle it */
405 }
406 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
407 
408 static void do_redirect(struct sk_buff *skb, struct sock *sk)
409 {
410 	struct dst_entry *dst = __sk_dst_check(sk, 0);
411 
412 	if (dst)
413 		dst->ops->redirect(dst, sk, skb);
414 }
415 
416 
417 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
418 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
419 {
420 	struct request_sock *req = inet_reqsk(sk);
421 	struct net *net = sock_net(sk);
422 
423 	/* ICMPs are not backlogged, hence we cannot get
424 	 * an established socket here.
425 	 */
426 	if (seq != tcp_rsk(req)->snt_isn) {
427 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
428 	} else if (abort) {
429 		/*
430 		 * Still in SYN_RECV, just remove it silently.
431 		 * There is no good way to pass the error to the newly
432 		 * created socket, and POSIX does not want network
433 		 * errors returned from accept().
434 		 */
435 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
436 		tcp_listendrop(req->rsk_listener);
437 	}
438 	reqsk_put(req);
439 }
440 EXPORT_IPV6_MOD(tcp_req_err);
441 
442 /* TCP-LD (RFC 6069) logic */
443 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
444 {
445 	struct inet_connection_sock *icsk = inet_csk(sk);
446 	struct tcp_sock *tp = tcp_sk(sk);
447 	struct sk_buff *skb;
448 	s32 remaining;
449 	u32 delta_us;
450 
451 	if (sock_owned_by_user(sk))
452 		return;
453 
454 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
455 	    !icsk->icsk_backoff)
456 		return;
457 
458 	skb = tcp_rtx_queue_head(sk);
459 	if (WARN_ON_ONCE(!skb))
460 		return;
461 
462 	icsk->icsk_backoff--;
463 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
464 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
465 
466 	tcp_mstamp_refresh(tp);
467 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
468 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
469 
470 	if (remaining > 0) {
471 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
472 	} else {
473 		/* RTO revert clocked out retransmission.
474 		 * Will retransmit now.
475 		 */
476 		tcp_retransmit_timer(sk);
477 	}
478 }
479 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
480 
481 /*
482  * This routine is called by the ICMP module when it gets some
483  * sort of error condition.  If err < 0 then the socket should
484  * be closed and the error returned to the user.  If err > 0
485  * it's just the icmp type << 8 | icmp code.  After adjustment
486  * header points to the first 8 bytes of the tcp header.  We need
487  * to find the appropriate port.
488  *
489  * The locking strategy used here is very "optimistic". When
490  * someone else accesses the socket the ICMP is just dropped
491  * and for some paths there is no check at all.
492  * A more general error queue to queue errors for later handling
493  * is probably better.
494  *
495  */
496 
497 int tcp_v4_err(struct sk_buff *skb, u32 info)
498 {
499 	const struct iphdr *iph = (const struct iphdr *)skb->data;
500 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
501 	struct net *net = dev_net_rcu(skb->dev);
502 	const int type = icmp_hdr(skb)->type;
503 	const int code = icmp_hdr(skb)->code;
504 	struct request_sock *fastopen;
505 	struct tcp_sock *tp;
506 	u32 seq, snd_una;
507 	struct sock *sk;
508 	int err;
509 
510 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
511 				       ntohs(th->source), inet_iif(skb), 0);
512 	if (!sk) {
513 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
514 		return -ENOENT;
515 	}
516 	if (sk->sk_state == TCP_TIME_WAIT) {
517 		/* To increase the counter of ignored icmps for TCP-AO */
518 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
519 		inet_twsk_put(inet_twsk(sk));
520 		return 0;
521 	}
522 	seq = ntohl(th->seq);
523 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
524 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
525 				     type == ICMP_TIME_EXCEEDED ||
526 				     (type == ICMP_DEST_UNREACH &&
527 				      (code == ICMP_NET_UNREACH ||
528 				       code == ICMP_HOST_UNREACH)));
529 		return 0;
530 	}
531 
532 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
533 		sock_put(sk);
534 		return 0;
535 	}
536 
537 	bh_lock_sock(sk);
538 	/* If too many ICMPs get dropped on busy
539 	 * servers this needs to be solved differently.
540 	 * We do take care of PMTU discovery (RFC1191) special case :
541 	 * we can receive locally generated ICMP messages while socket is held.
542 	 */
543 	if (sock_owned_by_user(sk)) {
544 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
545 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
546 	}
547 	if (sk->sk_state == TCP_CLOSE)
548 		goto out;
549 
550 	if (static_branch_unlikely(&ip4_min_ttl)) {
551 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
552 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
553 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
554 			goto out;
555 		}
556 	}
557 
558 	tp = tcp_sk(sk);
559 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
560 	fastopen = rcu_dereference(tp->fastopen_rsk);
561 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
562 	if (sk->sk_state != TCP_LISTEN &&
563 	    !between(seq, snd_una, tp->snd_nxt)) {
564 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
565 		goto out;
566 	}
567 
568 	switch (type) {
569 	case ICMP_REDIRECT:
570 		if (!sock_owned_by_user(sk))
571 			do_redirect(skb, sk);
572 		goto out;
573 	case ICMP_SOURCE_QUENCH:
574 		/* Just silently ignore these. */
575 		goto out;
576 	case ICMP_PARAMETERPROB:
577 		err = EPROTO;
578 		break;
579 	case ICMP_DEST_UNREACH:
580 		if (code > NR_ICMP_UNREACH)
581 			goto out;
582 
583 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
584 			/* We are not interested in TCP_LISTEN and open_requests
585 			 * (SYN-ACKs send out by Linux are always <576bytes so
586 			 * they should go through unfragmented).
587 			 */
588 			if (sk->sk_state == TCP_LISTEN)
589 				goto out;
590 
591 			WRITE_ONCE(tp->mtu_info, info);
592 			if (!sock_owned_by_user(sk)) {
593 				tcp_v4_mtu_reduced(sk);
594 			} else {
595 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
596 					sock_hold(sk);
597 			}
598 			goto out;
599 		}
600 
601 		err = icmp_err_convert[code].errno;
602 		/* check if this ICMP message allows revert of backoff.
603 		 * (see RFC 6069)
604 		 */
605 		if (!fastopen &&
606 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
607 			tcp_ld_RTO_revert(sk, seq);
608 		break;
609 	case ICMP_TIME_EXCEEDED:
610 		err = EHOSTUNREACH;
611 		break;
612 	default:
613 		goto out;
614 	}
615 
616 	switch (sk->sk_state) {
617 	case TCP_SYN_SENT:
618 	case TCP_SYN_RECV:
619 		/* Only in fast or simultaneous open. If a fast open socket is
620 		 * already accepted it is treated as a connected one below.
621 		 */
622 		if (fastopen && !fastopen->sk)
623 			break;
624 
625 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
626 
627 		if (!sock_owned_by_user(sk))
628 			tcp_done_with_error(sk, err);
629 		else
630 			WRITE_ONCE(sk->sk_err_soft, err);
631 		goto out;
632 	}
633 
634 	/* If we've already connected we will keep trying
635 	 * until we time out, or the user gives up.
636 	 *
637 	 * rfc1122 4.2.3.9 allows to consider as hard errors
638 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
639 	 * but it is obsoleted by pmtu discovery).
640 	 *
641 	 * Note, that in modern internet, where routing is unreliable
642 	 * and in each dark corner broken firewalls sit, sending random
643 	 * errors ordered by their masters even this two messages finally lose
644 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
645 	 *
646 	 * Now we are in compliance with RFCs.
647 	 *							--ANK (980905)
648 	 */
649 
650 	if (!sock_owned_by_user(sk) &&
651 	    inet_test_bit(RECVERR, sk)) {
652 		WRITE_ONCE(sk->sk_err, err);
653 		sk_error_report(sk);
654 	} else	{ /* Only an error on timeout */
655 		WRITE_ONCE(sk->sk_err_soft, err);
656 	}
657 
658 out:
659 	bh_unlock_sock(sk);
660 	sock_put(sk);
661 	return 0;
662 }
663 
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 	struct tcphdr *th = tcp_hdr(skb);
667 
668 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 	skb->csum_start = skb_transport_header(skb) - skb->head;
670 	skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672 
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 	const struct inet_sock *inet = inet_sk(sk);
677 
678 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_IPV6_MOD(tcp_v4_send_check);
681 
682 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683 
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 				 const struct tcp_ao_hdr *aoh,
686 				 struct ip_reply_arg *arg, struct tcphdr *reply,
687 				 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 	int sdif = tcp_v4_sdif(skb);
691 	int dif = inet_iif(skb);
692 	int l3index = sdif ? dif : 0;
693 	bool allocated_traffic_key;
694 	struct tcp_ao_key *key;
695 	char *traffic_key;
696 	bool drop = true;
697 	u32 ao_sne = 0;
698 	u8 keyid;
699 
700 	rcu_read_lock();
701 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 				 &key, &traffic_key, &allocated_traffic_key,
703 				 &keyid, &ao_sne))
704 		goto out;
705 
706 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 				 (aoh->rnext_keyid << 8) | keyid);
708 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 	reply->doff = arg->iov[0].iov_len / 4;
710 
711 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 			    key, traffic_key,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 			    reply, ao_sne))
716 		goto out;
717 	drop = false;
718 out:
719 	rcu_read_unlock();
720 	if (allocated_traffic_key)
721 		kfree(traffic_key);
722 	return drop;
723 #else
724 	return true;
725 #endif
726 }
727 
728 /*
729  *	This routine will send an RST to the other tcp.
730  *
731  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
732  *		      for reset.
733  *	Answer: if a packet caused RST, it is not for a socket
734  *		existing in our system, if it is matched to a socket,
735  *		it is just duplicate segment or bug in other side's TCP.
736  *		So that we build reply only basing on parameters
737  *		arrived with segment.
738  *	Exception: precedence violation. We do not implement it in any case.
739  */
740 
741 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
742 			      enum sk_rst_reason reason)
743 {
744 	const struct tcphdr *th = tcp_hdr(skb);
745 	struct {
746 		struct tcphdr th;
747 		__be32 opt[REPLY_OPTIONS_LEN];
748 	} rep;
749 	const __u8 *md5_hash_location = NULL;
750 	const struct tcp_ao_hdr *aoh;
751 	struct ip_reply_arg arg;
752 #ifdef CONFIG_TCP_MD5SIG
753 	struct tcp_md5sig_key *key = NULL;
754 	unsigned char newhash[16];
755 	struct sock *sk1 = NULL;
756 #endif
757 	u64 transmit_time = 0;
758 	struct sock *ctl_sk;
759 	struct net *net;
760 	u32 txhash = 0;
761 
762 	/* Never send a reset in response to a reset. */
763 	if (th->rst)
764 		return;
765 
766 	/* If sk not NULL, it means we did a successful lookup and incoming
767 	 * route had to be correct. prequeue might have dropped our dst.
768 	 */
769 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
770 		return;
771 
772 	/* Swap the send and the receive. */
773 	memset(&rep, 0, sizeof(rep));
774 	rep.th.dest   = th->source;
775 	rep.th.source = th->dest;
776 	rep.th.doff   = sizeof(struct tcphdr) / 4;
777 	rep.th.rst    = 1;
778 
779 	if (th->ack) {
780 		rep.th.seq = th->ack_seq;
781 	} else {
782 		rep.th.ack = 1;
783 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
784 				       skb->len - (th->doff << 2));
785 	}
786 
787 	memset(&arg, 0, sizeof(arg));
788 	arg.iov[0].iov_base = (unsigned char *)&rep;
789 	arg.iov[0].iov_len  = sizeof(rep.th);
790 
791 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
792 
793 	/* Invalid TCP option size or twice included auth */
794 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
795 		return;
796 
797 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
798 		return;
799 
800 #ifdef CONFIG_TCP_MD5SIG
801 	rcu_read_lock();
802 	if (sk && sk_fullsock(sk)) {
803 		const union tcp_md5_addr *addr;
804 		int l3index;
805 
806 		/* sdif set, means packet ingressed via a device
807 		 * in an L3 domain and inet_iif is set to it.
808 		 */
809 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
810 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
811 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
812 	} else if (md5_hash_location) {
813 		const union tcp_md5_addr *addr;
814 		int sdif = tcp_v4_sdif(skb);
815 		int dif = inet_iif(skb);
816 		int l3index;
817 
818 		/*
819 		 * active side is lost. Try to find listening socket through
820 		 * source port, and then find md5 key through listening socket.
821 		 * we are not loose security here:
822 		 * Incoming packet is checked with md5 hash with finding key,
823 		 * no RST generated if md5 hash doesn't match.
824 		 */
825 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
826 					     th->source, ip_hdr(skb)->daddr,
827 					     ntohs(th->source), dif, sdif);
828 		/* don't send rst if it can't find key */
829 		if (!sk1)
830 			goto out;
831 
832 		/* sdif set, means packet ingressed via a device
833 		 * in an L3 domain and dif is set to it.
834 		 */
835 		l3index = sdif ? dif : 0;
836 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
837 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
838 		if (!key)
839 			goto out;
840 
841 		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
842 		if (memcmp(md5_hash_location, newhash, 16) != 0)
843 			goto out;
844 	}
845 
846 	if (key) {
847 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
848 				   (TCPOPT_NOP << 16) |
849 				   (TCPOPT_MD5SIG << 8) |
850 				   TCPOLEN_MD5SIG);
851 		/* Update length and the length the header thinks exists */
852 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
853 		rep.th.doff = arg.iov[0].iov_len / 4;
854 
855 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
856 				     key, ip_hdr(skb)->saddr,
857 				     ip_hdr(skb)->daddr, &rep.th);
858 	}
859 #endif
860 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
861 	if (rep.opt[0] == 0) {
862 		__be32 mrst = mptcp_reset_option(skb);
863 
864 		if (mrst) {
865 			rep.opt[0] = mrst;
866 			arg.iov[0].iov_len += sizeof(mrst);
867 			rep.th.doff = arg.iov[0].iov_len / 4;
868 		}
869 	}
870 
871 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
872 				      ip_hdr(skb)->saddr, /* XXX */
873 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
874 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
875 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
876 
877 	/* When socket is gone, all binding information is lost.
878 	 * routing might fail in this case. No choice here, if we choose to force
879 	 * input interface, we will misroute in case of asymmetric route.
880 	 */
881 	if (sk)
882 		arg.bound_dev_if = sk->sk_bound_dev_if;
883 
884 	trace_tcp_send_reset(sk, skb, reason);
885 
886 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
887 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
888 
889 	/* ECN bits of TW reset are cleared */
890 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
891 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
894 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
895 
896 	sock_net_set(ctl_sk, net);
897 	if (sk) {
898 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
899 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
900 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
901 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
902 		transmit_time = tcp_transmit_time(sk);
903 		xfrm_sk_clone_policy(ctl_sk, sk);
904 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
905 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
906 	} else {
907 		ctl_sk->sk_mark = 0;
908 		ctl_sk->sk_priority = 0;
909 	}
910 	ip_send_unicast_reply(ctl_sk, sk,
911 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
912 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913 			      &arg, arg.iov[0].iov_len,
914 			      transmit_time, txhash);
915 
916 	xfrm_sk_free_policy(ctl_sk);
917 	sock_net_set(ctl_sk, &init_net);
918 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
919 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
920 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
921 	local_bh_enable();
922 
923 #ifdef CONFIG_TCP_MD5SIG
924 out:
925 	rcu_read_unlock();
926 #endif
927 }
928 
929 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
930    outside socket context is ugly, certainly. What can I do?
931  */
932 
933 static void tcp_v4_send_ack(const struct sock *sk,
934 			    struct sk_buff *skb, u32 seq, u32 ack,
935 			    u32 win, u32 tsval, u32 tsecr, int oif,
936 			    struct tcp_key *key,
937 			    int reply_flags, u8 tos, u32 txhash)
938 {
939 	const struct tcphdr *th = tcp_hdr(skb);
940 	struct {
941 		struct tcphdr th;
942 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
943 	} rep;
944 	struct net *net = sock_net(sk);
945 	struct ip_reply_arg arg;
946 	struct sock *ctl_sk;
947 	u64 transmit_time;
948 
949 	memset(&rep.th, 0, sizeof(struct tcphdr));
950 	memset(&arg, 0, sizeof(arg));
951 
952 	arg.iov[0].iov_base = (unsigned char *)&rep;
953 	arg.iov[0].iov_len  = sizeof(rep.th);
954 	if (tsecr) {
955 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
956 				   (TCPOPT_TIMESTAMP << 8) |
957 				   TCPOLEN_TIMESTAMP);
958 		rep.opt[1] = htonl(tsval);
959 		rep.opt[2] = htonl(tsecr);
960 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
961 	}
962 
963 	/* Swap the send and the receive. */
964 	rep.th.dest    = th->source;
965 	rep.th.source  = th->dest;
966 	rep.th.doff    = arg.iov[0].iov_len / 4;
967 	rep.th.seq     = htonl(seq);
968 	rep.th.ack_seq = htonl(ack);
969 	rep.th.ack     = 1;
970 	rep.th.window  = htons(win);
971 
972 #ifdef CONFIG_TCP_MD5SIG
973 	if (tcp_key_is_md5(key)) {
974 		int offset = (tsecr) ? 3 : 0;
975 
976 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
977 					  (TCPOPT_NOP << 16) |
978 					  (TCPOPT_MD5SIG << 8) |
979 					  TCPOLEN_MD5SIG);
980 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
981 		rep.th.doff = arg.iov[0].iov_len/4;
982 
983 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
984 				    key->md5_key, ip_hdr(skb)->saddr,
985 				    ip_hdr(skb)->daddr, &rep.th);
986 	}
987 #endif
988 #ifdef CONFIG_TCP_AO
989 	if (tcp_key_is_ao(key)) {
990 		int offset = (tsecr) ? 3 : 0;
991 
992 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
993 					  (tcp_ao_len(key->ao_key) << 16) |
994 					  (key->ao_key->sndid << 8) |
995 					  key->rcv_next);
996 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
997 		rep.th.doff = arg.iov[0].iov_len / 4;
998 
999 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1000 				key->ao_key, key->traffic_key,
1001 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1002 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1003 				&rep.th, key->sne);
1004 	}
1005 #endif
1006 	arg.flags = reply_flags;
1007 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1008 				      ip_hdr(skb)->saddr, /* XXX */
1009 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1010 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1011 	if (oif)
1012 		arg.bound_dev_if = oif;
1013 	arg.tos = tos;
1014 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1015 	local_bh_disable();
1016 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1017 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1018 	sock_net_set(ctl_sk, net);
1019 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1020 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1021 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1022 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1023 	transmit_time = tcp_transmit_time(sk);
1024 	ip_send_unicast_reply(ctl_sk, sk,
1025 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1026 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1027 			      &arg, arg.iov[0].iov_len,
1028 			      transmit_time, txhash);
1029 
1030 	sock_net_set(ctl_sk, &init_net);
1031 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1032 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1033 	local_bh_enable();
1034 }
1035 
1036 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1037 				enum tcp_tw_status tw_status)
1038 {
1039 	struct inet_timewait_sock *tw = inet_twsk(sk);
1040 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1041 	struct tcp_key key = {};
1042 	u8 tos = tw->tw_tos;
1043 
1044 	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
1045 	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
1046 	 * being placed in a different service queues (Classic rather than L4S)
1047 	 */
1048 	if (tw_status == TCP_TW_ACK_OOW)
1049 		tos &= ~INET_ECN_MASK;
1050 
1051 #ifdef CONFIG_TCP_AO
1052 	struct tcp_ao_info *ao_info;
1053 
1054 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1055 		/* FIXME: the segment to-be-acked is not verified yet */
1056 		ao_info = rcu_dereference(tcptw->ao_info);
1057 		if (ao_info) {
1058 			const struct tcp_ao_hdr *aoh;
1059 
1060 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1061 				inet_twsk_put(tw);
1062 				return;
1063 			}
1064 
1065 			if (aoh)
1066 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1067 								    aoh->rnext_keyid, -1);
1068 		}
1069 	}
1070 	if (key.ao_key) {
1071 		struct tcp_ao_key *rnext_key;
1072 
1073 		key.traffic_key = snd_other_key(key.ao_key);
1074 		key.sne = READ_ONCE(ao_info->snd_sne);
1075 		rnext_key = READ_ONCE(ao_info->rnext_key);
1076 		key.rcv_next = rnext_key->rcvid;
1077 		key.type = TCP_KEY_AO;
1078 #else
1079 	if (0) {
1080 #endif
1081 	} else if (static_branch_tcp_md5()) {
1082 		key.md5_key = tcp_twsk_md5_key(tcptw);
1083 		if (key.md5_key)
1084 			key.type = TCP_KEY_MD5;
1085 	}
1086 
1087 	tcp_v4_send_ack(sk, skb,
1088 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1089 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1090 			tcp_tw_tsval(tcptw),
1091 			READ_ONCE(tcptw->tw_ts_recent),
1092 			tw->tw_bound_dev_if, &key,
1093 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1094 			tos,
1095 			tw->tw_txhash);
1096 
1097 	inet_twsk_put(tw);
1098 }
1099 
1100 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1101 				  struct request_sock *req)
1102 {
1103 	struct tcp_key key = {};
1104 
1105 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1106 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1107 	 */
1108 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1109 					     tcp_sk(sk)->snd_nxt;
1110 
1111 #ifdef CONFIG_TCP_AO
1112 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1113 	    tcp_rsk_used_ao(req)) {
1114 		const union tcp_md5_addr *addr;
1115 		const struct tcp_ao_hdr *aoh;
1116 		int l3index;
1117 
1118 		/* Invalid TCP option size or twice included auth */
1119 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1120 			return;
1121 		if (!aoh)
1122 			return;
1123 
1124 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1125 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1126 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1127 					      aoh->rnext_keyid, -1);
1128 		if (unlikely(!key.ao_key)) {
1129 			/* Send ACK with any matching MKT for the peer */
1130 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1131 			/* Matching key disappeared (user removed the key?)
1132 			 * let the handshake timeout.
1133 			 */
1134 			if (!key.ao_key) {
1135 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1136 						     addr,
1137 						     ntohs(tcp_hdr(skb)->source),
1138 						     &ip_hdr(skb)->daddr,
1139 						     ntohs(tcp_hdr(skb)->dest));
1140 				return;
1141 			}
1142 		}
1143 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1144 		if (!key.traffic_key)
1145 			return;
1146 
1147 		key.type = TCP_KEY_AO;
1148 		key.rcv_next = aoh->keyid;
1149 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1150 #else
1151 	if (0) {
1152 #endif
1153 	} else if (static_branch_tcp_md5()) {
1154 		const union tcp_md5_addr *addr;
1155 		int l3index;
1156 
1157 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1158 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1159 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1160 		if (key.md5_key)
1161 			key.type = TCP_KEY_MD5;
1162 	}
1163 
1164 	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
1165 	tcp_v4_send_ack(sk, skb, seq,
1166 			tcp_rsk(req)->rcv_nxt,
1167 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1168 			tcp_rsk_tsval(tcp_rsk(req)),
1169 			req->ts_recent,
1170 			0, &key,
1171 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1172 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1173 			READ_ONCE(tcp_rsk(req)->txhash));
1174 	if (tcp_key_is_ao(&key))
1175 		kfree(key.traffic_key);
1176 }
1177 
1178 /*
1179  *	Send a SYN-ACK after having received a SYN.
1180  *	This still operates on a request_sock only, not on a big
1181  *	socket.
1182  */
1183 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1184 			      struct flowi *fl,
1185 			      struct request_sock *req,
1186 			      struct tcp_fastopen_cookie *foc,
1187 			      enum tcp_synack_type synack_type,
1188 			      struct sk_buff *syn_skb)
1189 {
1190 	struct inet_request_sock *ireq = inet_rsk(req);
1191 	struct flowi4 fl4;
1192 	int err = -1;
1193 	struct sk_buff *skb;
1194 	u8 tos;
1195 
1196 	/* First, grab a route. */
1197 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1198 		return -1;
1199 
1200 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1201 
1202 	if (skb) {
1203 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1204 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1205 
1206 		tos = READ_ONCE(inet_sk(sk)->tos);
1207 
1208 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1209 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1210 			      (tos & INET_ECN_MASK);
1211 
1212 		if (!INET_ECN_is_capable(tos) &&
1213 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1214 			tos |= INET_ECN_ECT_0;
1215 
1216 		rcu_read_lock();
1217 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1218 					    ireq->ir_rmt_addr,
1219 					    rcu_dereference(ireq->ireq_opt),
1220 					    tos);
1221 		rcu_read_unlock();
1222 		err = net_xmit_eval(err);
1223 	}
1224 
1225 	return err;
1226 }
1227 
1228 /*
1229  *	IPv4 request_sock destructor.
1230  */
1231 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1232 {
1233 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1234 }
1235 
1236 #ifdef CONFIG_TCP_MD5SIG
1237 /*
1238  * RFC2385 MD5 checksumming requires a mapping of
1239  * IP address->MD5 Key.
1240  * We need to maintain these in the sk structure.
1241  */
1242 
1243 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1244 EXPORT_IPV6_MOD(tcp_md5_needed);
1245 
1246 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1247 {
1248 	if (!old)
1249 		return true;
1250 
1251 	/* l3index always overrides non-l3index */
1252 	if (old->l3index && new->l3index == 0)
1253 		return false;
1254 	if (old->l3index == 0 && new->l3index)
1255 		return true;
1256 
1257 	return old->prefixlen < new->prefixlen;
1258 }
1259 
1260 /* Find the Key structure for an address.  */
1261 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1262 					   const union tcp_md5_addr *addr,
1263 					   int family, bool any_l3index)
1264 {
1265 	const struct tcp_sock *tp = tcp_sk(sk);
1266 	struct tcp_md5sig_key *key;
1267 	const struct tcp_md5sig_info *md5sig;
1268 	__be32 mask;
1269 	struct tcp_md5sig_key *best_match = NULL;
1270 	bool match;
1271 
1272 	/* caller either holds rcu_read_lock() or socket lock */
1273 	md5sig = rcu_dereference_check(tp->md5sig_info,
1274 				       lockdep_sock_is_held(sk));
1275 	if (!md5sig)
1276 		return NULL;
1277 
1278 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1279 				 lockdep_sock_is_held(sk)) {
1280 		if (key->family != family)
1281 			continue;
1282 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1283 		    key->l3index != l3index)
1284 			continue;
1285 		if (family == AF_INET) {
1286 			mask = inet_make_mask(key->prefixlen);
1287 			match = (key->addr.a4.s_addr & mask) ==
1288 				(addr->a4.s_addr & mask);
1289 #if IS_ENABLED(CONFIG_IPV6)
1290 		} else if (family == AF_INET6) {
1291 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1292 						  key->prefixlen);
1293 #endif
1294 		} else {
1295 			match = false;
1296 		}
1297 
1298 		if (match && better_md5_match(best_match, key))
1299 			best_match = key;
1300 	}
1301 	return best_match;
1302 }
1303 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1304 
1305 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1306 						      const union tcp_md5_addr *addr,
1307 						      int family, u8 prefixlen,
1308 						      int l3index, u8 flags)
1309 {
1310 	const struct tcp_sock *tp = tcp_sk(sk);
1311 	struct tcp_md5sig_key *key;
1312 	unsigned int size = sizeof(struct in_addr);
1313 	const struct tcp_md5sig_info *md5sig;
1314 
1315 	/* caller either holds rcu_read_lock() or socket lock */
1316 	md5sig = rcu_dereference_check(tp->md5sig_info,
1317 				       lockdep_sock_is_held(sk));
1318 	if (!md5sig)
1319 		return NULL;
1320 #if IS_ENABLED(CONFIG_IPV6)
1321 	if (family == AF_INET6)
1322 		size = sizeof(struct in6_addr);
1323 #endif
1324 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1325 				 lockdep_sock_is_held(sk)) {
1326 		if (key->family != family)
1327 			continue;
1328 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1329 			continue;
1330 		if (key->l3index != l3index)
1331 			continue;
1332 		if (!memcmp(&key->addr, addr, size) &&
1333 		    key->prefixlen == prefixlen)
1334 			return key;
1335 	}
1336 	return NULL;
1337 }
1338 
1339 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1340 					 const struct sock *addr_sk)
1341 {
1342 	const union tcp_md5_addr *addr;
1343 	int l3index;
1344 
1345 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1346 						 addr_sk->sk_bound_dev_if);
1347 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1348 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1349 }
1350 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1351 
1352 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1353 {
1354 	struct tcp_sock *tp = tcp_sk(sk);
1355 	struct tcp_md5sig_info *md5sig;
1356 
1357 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1358 	if (!md5sig)
1359 		return -ENOMEM;
1360 
1361 	sk_gso_disable(sk);
1362 	INIT_HLIST_HEAD(&md5sig->head);
1363 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1364 	return 0;
1365 }
1366 
1367 /* This can be called on a newly created socket, from other files */
1368 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1369 			    int family, u8 prefixlen, int l3index, u8 flags,
1370 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1371 {
1372 	/* Add Key to the list */
1373 	struct tcp_md5sig_key *key;
1374 	struct tcp_sock *tp = tcp_sk(sk);
1375 	struct tcp_md5sig_info *md5sig;
1376 
1377 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1378 	if (key) {
1379 		/* Pre-existing entry - just update that one.
1380 		 * Note that the key might be used concurrently.
1381 		 * data_race() is telling kcsan that we do not care of
1382 		 * key mismatches, since changing MD5 key on live flows
1383 		 * can lead to packet drops.
1384 		 */
1385 		data_race(memcpy(key->key, newkey, newkeylen));
1386 
1387 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1388 		 * Also note that a reader could catch new key->keylen value
1389 		 * but old key->key[], this is the reason we use __GFP_ZERO
1390 		 * at sock_kmalloc() time below these lines.
1391 		 */
1392 		WRITE_ONCE(key->keylen, newkeylen);
1393 
1394 		return 0;
1395 	}
1396 
1397 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1398 					   lockdep_sock_is_held(sk));
1399 
1400 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1401 	if (!key)
1402 		return -ENOMEM;
1403 
1404 	memcpy(key->key, newkey, newkeylen);
1405 	key->keylen = newkeylen;
1406 	key->family = family;
1407 	key->prefixlen = prefixlen;
1408 	key->l3index = l3index;
1409 	key->flags = flags;
1410 	memcpy(&key->addr, addr,
1411 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1412 								 sizeof(struct in_addr));
1413 	hlist_add_head_rcu(&key->node, &md5sig->head);
1414 	return 0;
1415 }
1416 
1417 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1418 		   int family, u8 prefixlen, int l3index, u8 flags,
1419 		   const u8 *newkey, u8 newkeylen)
1420 {
1421 	struct tcp_sock *tp = tcp_sk(sk);
1422 
1423 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1424 		if (fips_enabled) {
1425 			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
1426 			return -EOPNOTSUPP;
1427 		}
1428 
1429 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1430 			return -ENOMEM;
1431 
1432 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1433 			struct tcp_md5sig_info *md5sig;
1434 
1435 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1436 			rcu_assign_pointer(tp->md5sig_info, NULL);
1437 			kfree_rcu(md5sig, rcu);
1438 			return -EUSERS;
1439 		}
1440 	}
1441 
1442 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1443 				newkey, newkeylen, GFP_KERNEL);
1444 }
1445 EXPORT_IPV6_MOD(tcp_md5_do_add);
1446 
1447 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1448 		     int family, u8 prefixlen, int l3index,
1449 		     struct tcp_md5sig_key *key)
1450 {
1451 	struct tcp_sock *tp = tcp_sk(sk);
1452 
1453 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1454 
1455 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1456 			return -ENOMEM;
1457 
1458 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1459 			struct tcp_md5sig_info *md5sig;
1460 
1461 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1462 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1463 			rcu_assign_pointer(tp->md5sig_info, NULL);
1464 			kfree_rcu(md5sig, rcu);
1465 			return -EUSERS;
1466 		}
1467 	}
1468 
1469 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1470 				key->flags, key->key, key->keylen,
1471 				sk_gfp_mask(sk, GFP_ATOMIC));
1472 }
1473 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1474 
1475 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1476 		   u8 prefixlen, int l3index, u8 flags)
1477 {
1478 	struct tcp_md5sig_key *key;
1479 
1480 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1481 	if (!key)
1482 		return -ENOENT;
1483 	hlist_del_rcu(&key->node);
1484 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1485 	kfree_rcu(key, rcu);
1486 	return 0;
1487 }
1488 EXPORT_IPV6_MOD(tcp_md5_do_del);
1489 
1490 void tcp_clear_md5_list(struct sock *sk)
1491 {
1492 	struct tcp_sock *tp = tcp_sk(sk);
1493 	struct tcp_md5sig_key *key;
1494 	struct hlist_node *n;
1495 	struct tcp_md5sig_info *md5sig;
1496 
1497 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1498 
1499 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1500 		hlist_del(&key->node);
1501 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1502 		kfree(key);
1503 	}
1504 }
1505 
1506 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1507 				 sockptr_t optval, int optlen)
1508 {
1509 	struct tcp_md5sig cmd;
1510 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1511 	const union tcp_md5_addr *addr;
1512 	u8 prefixlen = 32;
1513 	int l3index = 0;
1514 	bool l3flag;
1515 	u8 flags;
1516 
1517 	if (optlen < sizeof(cmd))
1518 		return -EINVAL;
1519 
1520 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1521 		return -EFAULT;
1522 
1523 	if (sin->sin_family != AF_INET)
1524 		return -EINVAL;
1525 
1526 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1527 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1528 
1529 	if (optname == TCP_MD5SIG_EXT &&
1530 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1531 		prefixlen = cmd.tcpm_prefixlen;
1532 		if (prefixlen > 32)
1533 			return -EINVAL;
1534 	}
1535 
1536 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1537 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1538 		struct net_device *dev;
1539 
1540 		rcu_read_lock();
1541 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1542 		if (dev && netif_is_l3_master(dev))
1543 			l3index = dev->ifindex;
1544 
1545 		rcu_read_unlock();
1546 
1547 		/* ok to reference set/not set outside of rcu;
1548 		 * right now device MUST be an L3 master
1549 		 */
1550 		if (!dev || !l3index)
1551 			return -EINVAL;
1552 	}
1553 
1554 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1555 
1556 	if (!cmd.tcpm_keylen)
1557 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1558 
1559 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1560 		return -EINVAL;
1561 
1562 	/* Don't allow keys for peers that have a matching TCP-AO key.
1563 	 * See the comment in tcp_ao_add_cmd()
1564 	 */
1565 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1566 		return -EKEYREJECTED;
1567 
1568 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1569 			      cmd.tcpm_key, cmd.tcpm_keylen);
1570 }
1571 
1572 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1573 				    __be32 daddr, __be32 saddr,
1574 				    const struct tcphdr *th, int nbytes)
1575 {
1576 	struct {
1577 		struct tcp4_pseudohdr ip;
1578 		struct tcphdr tcp;
1579 	} h;
1580 
1581 	h.ip.saddr = saddr;
1582 	h.ip.daddr = daddr;
1583 	h.ip.pad = 0;
1584 	h.ip.protocol = IPPROTO_TCP;
1585 	h.ip.len = cpu_to_be16(nbytes);
1586 	h.tcp = *th;
1587 	h.tcp.check = 0;
1588 	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1589 }
1590 
1591 static noinline_for_stack void
1592 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1593 		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
1594 {
1595 	struct md5_ctx ctx;
1596 
1597 	md5_init(&ctx);
1598 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1599 	tcp_md5_hash_key(&ctx, key);
1600 	md5_final(&ctx, md5_hash);
1601 }
1602 
1603 noinline_for_stack void
1604 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1605 		    const struct sock *sk, const struct sk_buff *skb)
1606 {
1607 	const struct tcphdr *th = tcp_hdr(skb);
1608 	__be32 saddr, daddr;
1609 	struct md5_ctx ctx;
1610 
1611 	if (sk) { /* valid for establish/request sockets */
1612 		saddr = sk->sk_rcv_saddr;
1613 		daddr = sk->sk_daddr;
1614 	} else {
1615 		const struct iphdr *iph = ip_hdr(skb);
1616 		saddr = iph->saddr;
1617 		daddr = iph->daddr;
1618 	}
1619 
1620 	md5_init(&ctx);
1621 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1622 	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1623 	tcp_md5_hash_key(&ctx, key);
1624 	md5_final(&ctx, md5_hash);
1625 }
1626 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1627 
1628 #endif
1629 
1630 static void tcp_v4_init_req(struct request_sock *req,
1631 			    const struct sock *sk_listener,
1632 			    struct sk_buff *skb)
1633 {
1634 	struct inet_request_sock *ireq = inet_rsk(req);
1635 	struct net *net = sock_net(sk_listener);
1636 
1637 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1638 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1639 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1640 }
1641 
1642 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1643 					  struct sk_buff *skb,
1644 					  struct flowi *fl,
1645 					  struct request_sock *req,
1646 					  u32 tw_isn)
1647 {
1648 	tcp_v4_init_req(req, sk, skb);
1649 
1650 	if (security_inet_conn_request(sk, skb, req))
1651 		return NULL;
1652 
1653 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1654 }
1655 
1656 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1657 	.family		=	PF_INET,
1658 	.obj_size	=	sizeof(struct tcp_request_sock),
1659 	.send_ack	=	tcp_v4_reqsk_send_ack,
1660 	.destructor	=	tcp_v4_reqsk_destructor,
1661 	.send_reset	=	tcp_v4_send_reset,
1662 };
1663 
1664 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1665 	.mss_clamp	=	TCP_MSS_DEFAULT,
1666 #ifdef CONFIG_TCP_MD5SIG
1667 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1668 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1669 #endif
1670 #ifdef CONFIG_TCP_AO
1671 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1672 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1673 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1674 #endif
1675 #ifdef CONFIG_SYN_COOKIES
1676 	.cookie_init_seq =	cookie_v4_init_sequence,
1677 #endif
1678 	.route_req	=	tcp_v4_route_req,
1679 	.init_seq	=	tcp_v4_init_seq,
1680 	.init_ts_off	=	tcp_v4_init_ts_off,
1681 	.send_synack	=	tcp_v4_send_synack,
1682 };
1683 
1684 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1685 {
1686 	/* Never answer to SYNs send to broadcast or multicast */
1687 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1688 		goto drop;
1689 
1690 	return tcp_conn_request(&tcp_request_sock_ops,
1691 				&tcp_request_sock_ipv4_ops, sk, skb);
1692 
1693 drop:
1694 	tcp_listendrop(sk);
1695 	return 0;
1696 }
1697 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1698 
1699 
1700 /*
1701  * The three way handshake has completed - we got a valid synack -
1702  * now create the new socket.
1703  */
1704 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1705 				  struct request_sock *req,
1706 				  struct dst_entry *dst,
1707 				  struct request_sock *req_unhash,
1708 				  bool *own_req,
1709 				  void (*opt_child_init)(struct sock *newsk,
1710 							 const struct sock *sk))
1711 {
1712 	struct inet_request_sock *ireq;
1713 	bool found_dup_sk = false;
1714 	struct inet_sock *newinet;
1715 	struct tcp_sock *newtp;
1716 	struct sock *newsk;
1717 #ifdef CONFIG_TCP_MD5SIG
1718 	const union tcp_md5_addr *addr;
1719 	struct tcp_md5sig_key *key;
1720 	int l3index;
1721 #endif
1722 	struct ip_options_rcu *inet_opt;
1723 
1724 	if (sk_acceptq_is_full(sk))
1725 		goto exit_overflow;
1726 
1727 	newsk = tcp_create_openreq_child(sk, req, skb);
1728 	if (!newsk)
1729 		goto exit_nonewsk;
1730 
1731 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1732 	inet_sk_rx_dst_set(newsk, skb);
1733 
1734 	newtp		      = tcp_sk(newsk);
1735 	newinet		      = inet_sk(newsk);
1736 	ireq		      = inet_rsk(req);
1737 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1738 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1739 	newinet->mc_index     = inet_iif(skb);
1740 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1741 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1742 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1743 	if (inet_opt)
1744 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1745 	atomic_set(&newinet->inet_id, get_random_u16());
1746 
1747 	/* Set ToS of the new socket based upon the value of incoming SYN.
1748 	 * ECT bits are set later in tcp_init_transfer().
1749 	 */
1750 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1751 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1752 
1753 	if (!dst) {
1754 		dst = inet_csk_route_child_sock(sk, newsk, req);
1755 		if (!dst)
1756 			goto put_and_exit;
1757 	} else {
1758 		/* syncookie case : see end of cookie_v4_check() */
1759 	}
1760 	sk_setup_caps(newsk, dst);
1761 
1762 #if IS_ENABLED(CONFIG_IPV6)
1763 	if (opt_child_init)
1764 		opt_child_init(newsk, sk);
1765 #endif
1766 	tcp_ca_openreq_child(newsk, dst);
1767 
1768 	tcp_sync_mss(newsk, dst4_mtu(dst));
1769 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1770 
1771 	tcp_initialize_rcv_mss(newsk);
1772 
1773 #ifdef CONFIG_TCP_MD5SIG
1774 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1775 	/* Copy over the MD5 key from the original socket */
1776 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1777 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1778 	if (key && !tcp_rsk_used_ao(req)) {
1779 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1780 			goto put_and_exit;
1781 		sk_gso_disable(newsk);
1782 	}
1783 #endif
1784 #ifdef CONFIG_TCP_AO
1785 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1786 		goto put_and_exit; /* OOM, release back memory */
1787 #endif
1788 
1789 	if (__inet_inherit_port(sk, newsk) < 0)
1790 		goto put_and_exit;
1791 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1792 				       &found_dup_sk);
1793 	if (likely(*own_req)) {
1794 		tcp_move_syn(newtp, req);
1795 		ireq->ireq_opt = NULL;
1796 	} else {
1797 		newinet->inet_opt = NULL;
1798 
1799 		if (!req_unhash && found_dup_sk) {
1800 			/* This code path should only be executed in the
1801 			 * syncookie case only
1802 			 */
1803 			bh_unlock_sock(newsk);
1804 			sock_put(newsk);
1805 			newsk = NULL;
1806 		}
1807 	}
1808 	return newsk;
1809 
1810 exit_overflow:
1811 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1812 exit_nonewsk:
1813 	dst_release(dst);
1814 exit:
1815 	tcp_listendrop(sk);
1816 	return NULL;
1817 put_and_exit:
1818 	newinet->inet_opt = NULL;
1819 	inet_csk_prepare_forced_close(newsk);
1820 	tcp_done(newsk);
1821 	goto exit;
1822 }
1823 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1824 
1825 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1826 {
1827 #ifdef CONFIG_SYN_COOKIES
1828 	const struct tcphdr *th = tcp_hdr(skb);
1829 
1830 	if (!th->syn)
1831 		sk = cookie_v4_check(sk, skb);
1832 #endif
1833 	return sk;
1834 }
1835 
1836 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1837 			 struct tcphdr *th, u32 *cookie)
1838 {
1839 	u16 mss = 0;
1840 #ifdef CONFIG_SYN_COOKIES
1841 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1842 				    &tcp_request_sock_ipv4_ops, sk, th);
1843 	if (mss) {
1844 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1845 		tcp_synq_overflow(sk);
1846 	}
1847 #endif
1848 	return mss;
1849 }
1850 
1851 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1852 							   u32));
1853 /* The socket must have it's spinlock held when we get
1854  * here, unless it is a TCP_LISTEN socket.
1855  *
1856  * We have a potential double-lock case here, so even when
1857  * doing backlog processing we use the BH locking scheme.
1858  * This is because we cannot sleep with the original spinlock
1859  * held.
1860  */
1861 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1862 {
1863 	enum skb_drop_reason reason;
1864 	struct sock *rsk;
1865 
1866 	reason = psp_sk_rx_policy_check(sk, skb);
1867 	if (reason)
1868 		goto err_discard;
1869 
1870 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1871 		struct dst_entry *dst;
1872 
1873 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1874 						lockdep_sock_is_held(sk));
1875 
1876 		sock_rps_save_rxhash(sk, skb);
1877 		sk_mark_napi_id(sk, skb);
1878 		if (dst) {
1879 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1880 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1881 					     dst, 0)) {
1882 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1883 				dst_release(dst);
1884 			}
1885 		}
1886 		tcp_rcv_established(sk, skb);
1887 		return 0;
1888 	}
1889 
1890 	if (tcp_checksum_complete(skb))
1891 		goto csum_err;
1892 
1893 	if (sk->sk_state == TCP_LISTEN) {
1894 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1895 
1896 		if (!nsk)
1897 			return 0;
1898 		if (nsk != sk) {
1899 			reason = tcp_child_process(sk, nsk, skb);
1900 			if (reason) {
1901 				rsk = nsk;
1902 				goto reset;
1903 			}
1904 			return 0;
1905 		}
1906 	} else
1907 		sock_rps_save_rxhash(sk, skb);
1908 
1909 	reason = tcp_rcv_state_process(sk, skb);
1910 	if (reason) {
1911 		rsk = sk;
1912 		goto reset;
1913 	}
1914 	return 0;
1915 
1916 reset:
1917 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1918 discard:
1919 	sk_skb_reason_drop(sk, skb, reason);
1920 	/* Be careful here. If this function gets more complicated and
1921 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1922 	 * might be destroyed here. This current version compiles correctly,
1923 	 * but you have been warned.
1924 	 */
1925 	return 0;
1926 
1927 csum_err:
1928 	reason = SKB_DROP_REASON_TCP_CSUM;
1929 	trace_tcp_bad_csum(skb);
1930 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1931 err_discard:
1932 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1933 	goto discard;
1934 }
1935 EXPORT_SYMBOL(tcp_v4_do_rcv);
1936 
1937 int tcp_v4_early_demux(struct sk_buff *skb)
1938 {
1939 	struct net *net = dev_net_rcu(skb->dev);
1940 	const struct iphdr *iph;
1941 	const struct tcphdr *th;
1942 	struct sock *sk;
1943 
1944 	if (skb->pkt_type != PACKET_HOST)
1945 		return 0;
1946 
1947 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1948 		return 0;
1949 
1950 	iph = ip_hdr(skb);
1951 	th = tcp_hdr(skb);
1952 
1953 	if (th->doff < sizeof(struct tcphdr) / 4)
1954 		return 0;
1955 
1956 	sk = __inet_lookup_established(net, iph->saddr, th->source,
1957 				       iph->daddr, ntohs(th->dest),
1958 				       skb->skb_iif, inet_sdif(skb));
1959 	if (sk) {
1960 		skb->sk = sk;
1961 		skb->destructor = sock_edemux;
1962 		if (sk_fullsock(sk)) {
1963 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1964 
1965 			if (dst)
1966 				dst = dst_check(dst, 0);
1967 			if (dst &&
1968 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1969 				skb_dst_set_noref(skb, dst);
1970 		}
1971 	}
1972 	return 0;
1973 }
1974 
1975 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1976 		     enum skb_drop_reason *reason)
1977 {
1978 	u32 tail_gso_size, tail_gso_segs;
1979 	struct skb_shared_info *shinfo;
1980 	const struct tcphdr *th;
1981 	struct tcphdr *thtail;
1982 	struct sk_buff *tail;
1983 	unsigned int hdrlen;
1984 	bool fragstolen;
1985 	u32 gso_segs;
1986 	u32 gso_size;
1987 	u64 limit;
1988 	int delta;
1989 	int err;
1990 
1991 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1992 	 * we can fix skb->truesize to its real value to avoid future drops.
1993 	 * This is valid because skb is not yet charged to the socket.
1994 	 * It has been noticed pure SACK packets were sometimes dropped
1995 	 * (if cooked by drivers without copybreak feature).
1996 	 */
1997 	skb_condense(skb);
1998 
1999 	tcp_cleanup_skb(skb);
2000 
2001 	if (unlikely(tcp_checksum_complete(skb))) {
2002 		bh_unlock_sock(sk);
2003 		trace_tcp_bad_csum(skb);
2004 		*reason = SKB_DROP_REASON_TCP_CSUM;
2005 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2006 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2007 		return true;
2008 	}
2009 
2010 	/* Attempt coalescing to last skb in backlog, even if we are
2011 	 * above the limits.
2012 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2013 	 */
2014 	th = (const struct tcphdr *)skb->data;
2015 	hdrlen = th->doff * 4;
2016 
2017 	tail = sk->sk_backlog.tail;
2018 	if (!tail)
2019 		goto no_coalesce;
2020 	thtail = (struct tcphdr *)tail->data;
2021 
2022 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2023 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2024 	    ((TCP_SKB_CB(tail)->tcp_flags |
2025 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2026 	    !((TCP_SKB_CB(tail)->tcp_flags &
2027 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2028 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2029 	      TCP_SKB_CB(skb)->tcp_flags) &
2030 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2031 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2032 	    thtail->doff != th->doff ||
2033 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
2034 	    /* prior to PSP Rx policy check, retain exact PSP metadata */
2035 	    psp_skb_coalesce_diff(tail, skb))
2036 		goto no_coalesce;
2037 
2038 	__skb_pull(skb, hdrlen);
2039 
2040 	shinfo = skb_shinfo(skb);
2041 	gso_size = shinfo->gso_size ?: skb->len;
2042 	gso_segs = shinfo->gso_segs ?: 1;
2043 
2044 	shinfo = skb_shinfo(tail);
2045 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2046 	tail_gso_segs = shinfo->gso_segs ?: 1;
2047 
2048 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2049 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2050 
2051 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2052 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2053 			thtail->window = th->window;
2054 		}
2055 
2056 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2057 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2058 		 * is not entered if we append a packet with a FIN.
2059 		 * SYN, RST, URG are not present.
2060 		 * ACK is set on both packets.
2061 		 * PSH : we do not really care in TCP stack,
2062 		 *       at least for 'GRO' packets.
2063 		 */
2064 		thtail->fin |= th->fin;
2065 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2066 
2067 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2068 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2069 			tail->tstamp = skb->tstamp;
2070 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2071 		}
2072 
2073 		/* Not as strict as GRO. We only need to carry mss max value */
2074 		shinfo->gso_size = max(gso_size, tail_gso_size);
2075 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2076 
2077 		sk->sk_backlog.len += delta;
2078 		__NET_INC_STATS(sock_net(sk),
2079 				LINUX_MIB_TCPBACKLOGCOALESCE);
2080 		kfree_skb_partial(skb, fragstolen);
2081 		return false;
2082 	}
2083 	__skb_push(skb, hdrlen);
2084 
2085 no_coalesce:
2086 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2087 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2088 	 * sk_rcvbuf in normal conditions.
2089 	 */
2090 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2091 
2092 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2093 
2094 	/* Only socket owner can try to collapse/prune rx queues
2095 	 * to reduce memory overhead, so add a little headroom here.
2096 	 * Few sockets backlog are possibly concurrently non empty.
2097 	 */
2098 	limit += 64 * 1024;
2099 
2100 	limit = min_t(u64, limit, UINT_MAX);
2101 
2102 	err = sk_add_backlog(sk, skb, limit);
2103 	if (unlikely(err)) {
2104 		bh_unlock_sock(sk);
2105 		if (err == -ENOMEM) {
2106 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2107 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2108 		} else {
2109 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2110 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2111 		}
2112 		return true;
2113 	}
2114 	return false;
2115 }
2116 EXPORT_IPV6_MOD(tcp_add_backlog);
2117 
2118 static void tcp_v4_restore_cb(struct sk_buff *skb)
2119 {
2120 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2121 		sizeof(struct inet_skb_parm));
2122 }
2123 
2124 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2125 			   const struct tcphdr *th)
2126 {
2127 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2128 	 * barrier() makes sure compiler wont play fool^Waliasing games.
2129 	 */
2130 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2131 		sizeof(struct inet_skb_parm));
2132 	barrier();
2133 
2134 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2135 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2136 				    skb->len - th->doff * 4);
2137 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2138 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2139 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2140 	TCP_SKB_CB(skb)->sacked	 = 0;
2141 	TCP_SKB_CB(skb)->has_rxtstamp =
2142 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2143 }
2144 
2145 /*
2146  *	From tcp_input.c
2147  */
2148 
2149 int tcp_v4_rcv(struct sk_buff *skb)
2150 {
2151 	struct net *net = dev_net_rcu(skb->dev);
2152 	enum skb_drop_reason drop_reason;
2153 	enum tcp_tw_status tw_status;
2154 	int sdif = inet_sdif(skb);
2155 	int dif = inet_iif(skb);
2156 	const struct iphdr *iph;
2157 	const struct tcphdr *th;
2158 	struct sock *sk = NULL;
2159 	bool refcounted;
2160 	int ret;
2161 	u32 isn;
2162 
2163 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2164 	if (skb->pkt_type != PACKET_HOST)
2165 		goto discard_it;
2166 
2167 	/* Count it even if it's bad */
2168 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2169 
2170 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2171 		goto discard_it;
2172 
2173 	th = (const struct tcphdr *)skb->data;
2174 
2175 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2176 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2177 		goto bad_packet;
2178 	}
2179 	if (!pskb_may_pull(skb, th->doff * 4))
2180 		goto discard_it;
2181 
2182 	/* An explanation is required here, I think.
2183 	 * Packet length and doff are validated by header prediction,
2184 	 * provided case of th->doff==0 is eliminated.
2185 	 * So, we defer the checks. */
2186 
2187 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2188 		goto csum_error;
2189 
2190 	th = (const struct tcphdr *)skb->data;
2191 	iph = ip_hdr(skb);
2192 lookup:
2193 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2194 			       th->dest, sdif, &refcounted);
2195 	if (!sk)
2196 		goto no_tcp_socket;
2197 
2198 	if (sk->sk_state == TCP_TIME_WAIT)
2199 		goto do_time_wait;
2200 
2201 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2202 		struct request_sock *req = inet_reqsk(sk);
2203 		bool req_stolen = false;
2204 		struct sock *nsk;
2205 
2206 		sk = req->rsk_listener;
2207 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2208 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2209 		else
2210 			drop_reason = tcp_inbound_hash(sk, req, skb,
2211 						       &iph->saddr, &iph->daddr,
2212 						       AF_INET, dif, sdif);
2213 		if (unlikely(drop_reason)) {
2214 			sk_drops_skbadd(sk, skb);
2215 			reqsk_put(req);
2216 			goto discard_it;
2217 		}
2218 		if (tcp_checksum_complete(skb)) {
2219 			reqsk_put(req);
2220 			goto csum_error;
2221 		}
2222 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2223 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2224 			if (!nsk) {
2225 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2226 				goto lookup;
2227 			}
2228 			sk = nsk;
2229 			/* reuseport_migrate_sock() has already held one sk_refcnt
2230 			 * before returning.
2231 			 */
2232 		} else {
2233 			/* We own a reference on the listener, increase it again
2234 			 * as we might lose it too soon.
2235 			 */
2236 			sock_hold(sk);
2237 		}
2238 		refcounted = true;
2239 		nsk = NULL;
2240 		if (!tcp_filter(sk, skb, &drop_reason)) {
2241 			th = (const struct tcphdr *)skb->data;
2242 			iph = ip_hdr(skb);
2243 			tcp_v4_fill_cb(skb, iph, th);
2244 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2245 					    &drop_reason);
2246 		}
2247 		if (!nsk) {
2248 			reqsk_put(req);
2249 			if (req_stolen) {
2250 				/* Another cpu got exclusive access to req
2251 				 * and created a full blown socket.
2252 				 * Try to feed this packet to this socket
2253 				 * instead of discarding it.
2254 				 */
2255 				tcp_v4_restore_cb(skb);
2256 				sock_put(sk);
2257 				goto lookup;
2258 			}
2259 			goto discard_and_relse;
2260 		}
2261 		nf_reset_ct(skb);
2262 		if (nsk == sk) {
2263 			reqsk_put(req);
2264 			tcp_v4_restore_cb(skb);
2265 		} else {
2266 			drop_reason = tcp_child_process(sk, nsk, skb);
2267 			if (drop_reason) {
2268 				enum sk_rst_reason rst_reason;
2269 
2270 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2271 				tcp_v4_send_reset(nsk, skb, rst_reason);
2272 				goto discard_and_relse;
2273 			}
2274 			sock_put(sk);
2275 			return 0;
2276 		}
2277 	}
2278 
2279 process:
2280 	if (static_branch_unlikely(&ip4_min_ttl)) {
2281 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2282 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2283 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2284 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2285 			goto discard_and_relse;
2286 		}
2287 	}
2288 
2289 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2290 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2291 		goto discard_and_relse;
2292 	}
2293 
2294 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2295 				       AF_INET, dif, sdif);
2296 	if (drop_reason)
2297 		goto discard_and_relse;
2298 
2299 	nf_reset_ct(skb);
2300 
2301 	if (tcp_filter(sk, skb, &drop_reason))
2302 		goto discard_and_relse;
2303 
2304 	th = (const struct tcphdr *)skb->data;
2305 	iph = ip_hdr(skb);
2306 	tcp_v4_fill_cb(skb, iph, th);
2307 
2308 	skb->dev = NULL;
2309 
2310 	if (sk->sk_state == TCP_LISTEN) {
2311 		ret = tcp_v4_do_rcv(sk, skb);
2312 		goto put_and_return;
2313 	}
2314 
2315 	sk_incoming_cpu_update(sk);
2316 
2317 	bh_lock_sock_nested(sk);
2318 	tcp_segs_in(tcp_sk(sk), skb);
2319 	ret = 0;
2320 	if (!sock_owned_by_user(sk)) {
2321 		ret = tcp_v4_do_rcv(sk, skb);
2322 	} else {
2323 		if (tcp_add_backlog(sk, skb, &drop_reason))
2324 			goto discard_and_relse;
2325 	}
2326 	bh_unlock_sock(sk);
2327 
2328 put_and_return:
2329 	if (refcounted)
2330 		sock_put(sk);
2331 
2332 	return ret;
2333 
2334 no_tcp_socket:
2335 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2336 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2337 		goto discard_it;
2338 
2339 	tcp_v4_fill_cb(skb, iph, th);
2340 
2341 	if (tcp_checksum_complete(skb)) {
2342 csum_error:
2343 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2344 		trace_tcp_bad_csum(skb);
2345 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2346 bad_packet:
2347 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2348 	} else {
2349 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2350 	}
2351 
2352 discard_it:
2353 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2354 	/* Discard frame. */
2355 	sk_skb_reason_drop(sk, skb, drop_reason);
2356 	return 0;
2357 
2358 discard_and_relse:
2359 	sk_drops_skbadd(sk, skb);
2360 	if (refcounted)
2361 		sock_put(sk);
2362 	goto discard_it;
2363 
2364 do_time_wait:
2365 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2366 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2367 		inet_twsk_put(inet_twsk(sk));
2368 		goto discard_it;
2369 	}
2370 
2371 	tcp_v4_fill_cb(skb, iph, th);
2372 
2373 	if (tcp_checksum_complete(skb)) {
2374 		inet_twsk_put(inet_twsk(sk));
2375 		goto csum_error;
2376 	}
2377 
2378 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2379 					       &drop_reason);
2380 	switch (tw_status) {
2381 	case TCP_TW_SYN: {
2382 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2383 							iph->saddr, th->source,
2384 							iph->daddr, th->dest,
2385 							inet_iif(skb),
2386 							sdif);
2387 		if (sk2) {
2388 			inet_twsk_deschedule_put(inet_twsk(sk));
2389 			sk = sk2;
2390 			tcp_v4_restore_cb(skb);
2391 			refcounted = false;
2392 			__this_cpu_write(tcp_tw_isn, isn);
2393 			goto process;
2394 		}
2395 
2396 		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
2397 		if (drop_reason)
2398 			break;
2399 	}
2400 		/* to ACK */
2401 		fallthrough;
2402 	case TCP_TW_ACK:
2403 	case TCP_TW_ACK_OOW:
2404 		tcp_v4_timewait_ack(sk, skb, tw_status);
2405 		break;
2406 	case TCP_TW_RST:
2407 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2408 		inet_twsk_deschedule_put(inet_twsk(sk));
2409 		goto discard_it;
2410 	case TCP_TW_SUCCESS:;
2411 	}
2412 	goto discard_it;
2413 }
2414 
2415 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2416 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2417 };
2418 
2419 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2420 {
2421 	struct dst_entry *dst = skb_dst(skb);
2422 
2423 	if (dst && dst_hold_safe(dst)) {
2424 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2425 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2426 	}
2427 }
2428 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2429 
2430 const struct inet_connection_sock_af_ops ipv4_specific = {
2431 	.queue_xmit	   = ip_queue_xmit,
2432 	.send_check	   = tcp_v4_send_check,
2433 	.rebuild_header	   = inet_sk_rebuild_header,
2434 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2435 	.conn_request	   = tcp_v4_conn_request,
2436 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2437 	.net_header_len	   = sizeof(struct iphdr),
2438 	.setsockopt	   = ip_setsockopt,
2439 	.getsockopt	   = ip_getsockopt,
2440 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2441 };
2442 EXPORT_IPV6_MOD(ipv4_specific);
2443 
2444 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2445 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2446 #ifdef CONFIG_TCP_MD5SIG
2447 	.md5_lookup		= tcp_v4_md5_lookup,
2448 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2449 	.md5_parse		= tcp_v4_parse_md5_keys,
2450 #endif
2451 #ifdef CONFIG_TCP_AO
2452 	.ao_lookup		= tcp_v4_ao_lookup,
2453 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2454 	.ao_parse		= tcp_v4_parse_ao,
2455 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2456 #endif
2457 };
2458 
2459 static void tcp4_destruct_sock(struct sock *sk)
2460 {
2461 	tcp_md5_destruct_sock(sk);
2462 	tcp_ao_destroy_sock(sk, false);
2463 	inet_sock_destruct(sk);
2464 }
2465 #endif
2466 
2467 /* NOTE: A lot of things set to zero explicitly by call to
2468  *       sk_alloc() so need not be done here.
2469  */
2470 static int tcp_v4_init_sock(struct sock *sk)
2471 {
2472 	struct inet_connection_sock *icsk = inet_csk(sk);
2473 
2474 	tcp_init_sock(sk);
2475 
2476 	icsk->icsk_af_ops = &ipv4_specific;
2477 
2478 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2479 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2480 	sk->sk_destruct = tcp4_destruct_sock;
2481 #endif
2482 
2483 	return 0;
2484 }
2485 
2486 static void tcp_release_user_frags(struct sock *sk)
2487 {
2488 #ifdef CONFIG_PAGE_POOL
2489 	unsigned long index;
2490 	void *netmem;
2491 
2492 	xa_for_each(&sk->sk_user_frags, index, netmem)
2493 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2494 #endif
2495 }
2496 
2497 void tcp_v4_destroy_sock(struct sock *sk)
2498 {
2499 	struct tcp_sock *tp = tcp_sk(sk);
2500 
2501 	tcp_release_user_frags(sk);
2502 
2503 	xa_destroy(&sk->sk_user_frags);
2504 
2505 	trace_tcp_destroy_sock(sk);
2506 
2507 	tcp_clear_xmit_timers(sk);
2508 
2509 	tcp_cleanup_congestion_control(sk);
2510 
2511 	tcp_cleanup_ulp(sk);
2512 
2513 	/* Cleanup up the write buffer. */
2514 	tcp_write_queue_purge(sk);
2515 
2516 	/* Check if we want to disable active TFO */
2517 	tcp_fastopen_active_disable_ofo_check(sk);
2518 
2519 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2520 	skb_rbtree_purge(&tp->out_of_order_queue);
2521 
2522 	/* Clean up a referenced TCP bind bucket. */
2523 	if (inet_csk(sk)->icsk_bind_hash)
2524 		inet_put_port(sk);
2525 
2526 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2527 
2528 	/* If socket is aborted during connect operation */
2529 	tcp_free_fastopen_req(tp);
2530 	tcp_fastopen_destroy_cipher(sk);
2531 	tcp_saved_syn_free(tp);
2532 
2533 	sk_sockets_allocated_dec(sk);
2534 }
2535 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2536 
2537 #ifdef CONFIG_PROC_FS
2538 /* Proc filesystem TCP sock list dumping. */
2539 
2540 static unsigned short seq_file_family(const struct seq_file *seq);
2541 
2542 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2543 {
2544 	unsigned short family = seq_file_family(seq);
2545 
2546 	/* AF_UNSPEC is used as a match all */
2547 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2548 		net_eq(sock_net(sk), seq_file_net(seq)));
2549 }
2550 
2551 /* Find a non empty bucket (starting from st->bucket)
2552  * and return the first sk from it.
2553  */
2554 static void *listening_get_first(struct seq_file *seq)
2555 {
2556 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2557 	struct tcp_iter_state *st = seq->private;
2558 
2559 	st->offset = 0;
2560 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2561 		struct inet_listen_hashbucket *ilb2;
2562 		struct hlist_nulls_node *node;
2563 		struct sock *sk;
2564 
2565 		ilb2 = &hinfo->lhash2[st->bucket];
2566 		if (hlist_nulls_empty(&ilb2->nulls_head))
2567 			continue;
2568 
2569 		spin_lock(&ilb2->lock);
2570 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2571 			if (seq_sk_match(seq, sk))
2572 				return sk;
2573 		}
2574 		spin_unlock(&ilb2->lock);
2575 	}
2576 
2577 	return NULL;
2578 }
2579 
2580 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2581  * If "cur" is the last one in the st->bucket,
2582  * call listening_get_first() to return the first sk of the next
2583  * non empty bucket.
2584  */
2585 static void *listening_get_next(struct seq_file *seq, void *cur)
2586 {
2587 	struct tcp_iter_state *st = seq->private;
2588 	struct inet_listen_hashbucket *ilb2;
2589 	struct hlist_nulls_node *node;
2590 	struct inet_hashinfo *hinfo;
2591 	struct sock *sk = cur;
2592 
2593 	++st->num;
2594 	++st->offset;
2595 
2596 	sk = sk_nulls_next(sk);
2597 	sk_nulls_for_each_from(sk, node) {
2598 		if (seq_sk_match(seq, sk))
2599 			return sk;
2600 	}
2601 
2602 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2603 	ilb2 = &hinfo->lhash2[st->bucket];
2604 	spin_unlock(&ilb2->lock);
2605 	++st->bucket;
2606 	return listening_get_first(seq);
2607 }
2608 
2609 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2610 {
2611 	struct tcp_iter_state *st = seq->private;
2612 	void *rc;
2613 
2614 	st->bucket = 0;
2615 	st->offset = 0;
2616 	rc = listening_get_first(seq);
2617 
2618 	while (rc && *pos) {
2619 		rc = listening_get_next(seq, rc);
2620 		--*pos;
2621 	}
2622 	return rc;
2623 }
2624 
2625 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2626 				const struct tcp_iter_state *st)
2627 {
2628 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2629 }
2630 
2631 /*
2632  * Get first established socket starting from bucket given in st->bucket.
2633  * If st->bucket is zero, the very first socket in the hash is returned.
2634  */
2635 static void *established_get_first(struct seq_file *seq)
2636 {
2637 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2638 	struct tcp_iter_state *st = seq->private;
2639 
2640 	st->offset = 0;
2641 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2642 		struct sock *sk;
2643 		struct hlist_nulls_node *node;
2644 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2645 
2646 		cond_resched();
2647 
2648 		/* Lockless fast path for the common case of empty buckets */
2649 		if (empty_bucket(hinfo, st))
2650 			continue;
2651 
2652 		spin_lock_bh(lock);
2653 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2654 			if (seq_sk_match(seq, sk))
2655 				return sk;
2656 		}
2657 		spin_unlock_bh(lock);
2658 	}
2659 
2660 	return NULL;
2661 }
2662 
2663 static void *established_get_next(struct seq_file *seq, void *cur)
2664 {
2665 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2666 	struct tcp_iter_state *st = seq->private;
2667 	struct hlist_nulls_node *node;
2668 	struct sock *sk = cur;
2669 
2670 	++st->num;
2671 	++st->offset;
2672 
2673 	sk = sk_nulls_next(sk);
2674 
2675 	sk_nulls_for_each_from(sk, node) {
2676 		if (seq_sk_match(seq, sk))
2677 			return sk;
2678 	}
2679 
2680 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2681 	++st->bucket;
2682 	return established_get_first(seq);
2683 }
2684 
2685 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2686 {
2687 	struct tcp_iter_state *st = seq->private;
2688 	void *rc;
2689 
2690 	st->bucket = 0;
2691 	rc = established_get_first(seq);
2692 
2693 	while (rc && pos) {
2694 		rc = established_get_next(seq, rc);
2695 		--pos;
2696 	}
2697 	return rc;
2698 }
2699 
2700 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2701 {
2702 	void *rc;
2703 	struct tcp_iter_state *st = seq->private;
2704 
2705 	st->state = TCP_SEQ_STATE_LISTENING;
2706 	rc	  = listening_get_idx(seq, &pos);
2707 
2708 	if (!rc) {
2709 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2710 		rc	  = established_get_idx(seq, pos);
2711 	}
2712 
2713 	return rc;
2714 }
2715 
2716 static void *tcp_seek_last_pos(struct seq_file *seq)
2717 {
2718 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2719 	struct tcp_iter_state *st = seq->private;
2720 	int bucket = st->bucket;
2721 	int offset = st->offset;
2722 	int orig_num = st->num;
2723 	void *rc = NULL;
2724 
2725 	switch (st->state) {
2726 	case TCP_SEQ_STATE_LISTENING:
2727 		if (st->bucket > hinfo->lhash2_mask)
2728 			break;
2729 		rc = listening_get_first(seq);
2730 		while (offset-- && rc && bucket == st->bucket)
2731 			rc = listening_get_next(seq, rc);
2732 		if (rc)
2733 			break;
2734 		st->bucket = 0;
2735 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2736 		fallthrough;
2737 	case TCP_SEQ_STATE_ESTABLISHED:
2738 		if (st->bucket > hinfo->ehash_mask)
2739 			break;
2740 		rc = established_get_first(seq);
2741 		while (offset-- && rc && bucket == st->bucket)
2742 			rc = established_get_next(seq, rc);
2743 	}
2744 
2745 	st->num = orig_num;
2746 
2747 	return rc;
2748 }
2749 
2750 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2751 {
2752 	struct tcp_iter_state *st = seq->private;
2753 	void *rc;
2754 
2755 	if (*pos && *pos == st->last_pos) {
2756 		rc = tcp_seek_last_pos(seq);
2757 		if (rc)
2758 			goto out;
2759 	}
2760 
2761 	st->state = TCP_SEQ_STATE_LISTENING;
2762 	st->num = 0;
2763 	st->bucket = 0;
2764 	st->offset = 0;
2765 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2766 
2767 out:
2768 	st->last_pos = *pos;
2769 	return rc;
2770 }
2771 EXPORT_IPV6_MOD(tcp_seq_start);
2772 
2773 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2774 {
2775 	struct tcp_iter_state *st = seq->private;
2776 	void *rc = NULL;
2777 
2778 	if (v == SEQ_START_TOKEN) {
2779 		rc = tcp_get_idx(seq, 0);
2780 		goto out;
2781 	}
2782 
2783 	switch (st->state) {
2784 	case TCP_SEQ_STATE_LISTENING:
2785 		rc = listening_get_next(seq, v);
2786 		if (!rc) {
2787 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2788 			st->bucket = 0;
2789 			st->offset = 0;
2790 			rc	  = established_get_first(seq);
2791 		}
2792 		break;
2793 	case TCP_SEQ_STATE_ESTABLISHED:
2794 		rc = established_get_next(seq, v);
2795 		break;
2796 	}
2797 out:
2798 	++*pos;
2799 	st->last_pos = *pos;
2800 	return rc;
2801 }
2802 EXPORT_IPV6_MOD(tcp_seq_next);
2803 
2804 void tcp_seq_stop(struct seq_file *seq, void *v)
2805 {
2806 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2807 	struct tcp_iter_state *st = seq->private;
2808 
2809 	switch (st->state) {
2810 	case TCP_SEQ_STATE_LISTENING:
2811 		if (v != SEQ_START_TOKEN)
2812 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2813 		break;
2814 	case TCP_SEQ_STATE_ESTABLISHED:
2815 		if (v)
2816 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2817 		break;
2818 	}
2819 }
2820 EXPORT_IPV6_MOD(tcp_seq_stop);
2821 
2822 static void get_openreq4(const struct request_sock *req,
2823 			 struct seq_file *f, int i)
2824 {
2825 	const struct inet_request_sock *ireq = inet_rsk(req);
2826 	long delta = req->rsk_timer.expires - jiffies;
2827 
2828 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2829 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2830 		i,
2831 		ireq->ir_loc_addr,
2832 		ireq->ir_num,
2833 		ireq->ir_rmt_addr,
2834 		ntohs(ireq->ir_rmt_port),
2835 		TCP_SYN_RECV,
2836 		0, 0, /* could print option size, but that is af dependent. */
2837 		1,    /* timers active (only the expire timer) */
2838 		jiffies_delta_to_clock_t(delta),
2839 		req->num_timeout,
2840 		from_kuid_munged(seq_user_ns(f),
2841 				 sk_uid(req->rsk_listener)),
2842 		0,  /* non standard timer */
2843 		0, /* open_requests have no inode */
2844 		0,
2845 		req);
2846 }
2847 
2848 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2849 {
2850 	int timer_active;
2851 	unsigned long timer_expires;
2852 	const struct tcp_sock *tp = tcp_sk(sk);
2853 	const struct inet_connection_sock *icsk = inet_csk(sk);
2854 	const struct inet_sock *inet = inet_sk(sk);
2855 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2856 	__be32 dest = inet->inet_daddr;
2857 	__be32 src = inet->inet_rcv_saddr;
2858 	__u16 destp = ntohs(inet->inet_dport);
2859 	__u16 srcp = ntohs(inet->inet_sport);
2860 	u8 icsk_pending;
2861 	int rx_queue;
2862 	int state;
2863 
2864 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2865 	if (icsk_pending == ICSK_TIME_RETRANS ||
2866 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2867 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2868 		timer_active	= 1;
2869 		timer_expires	= tcp_timeout_expires(sk);
2870 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2871 		timer_active	= 4;
2872 		timer_expires	= tcp_timeout_expires(sk);
2873 	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
2874 		timer_active	= 2;
2875 		timer_expires	= icsk->icsk_keepalive_timer.expires;
2876 	} else {
2877 		timer_active	= 0;
2878 		timer_expires = jiffies;
2879 	}
2880 
2881 	state = inet_sk_state_load(sk);
2882 	if (state == TCP_LISTEN)
2883 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2884 	else
2885 		/* Because we don't lock the socket,
2886 		 * we might find a transient negative value.
2887 		 */
2888 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2889 				      READ_ONCE(tp->copied_seq), 0);
2890 
2891 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2892 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2893 		i, src, srcp, dest, destp, state,
2894 		READ_ONCE(tp->write_seq) - tp->snd_una,
2895 		rx_queue,
2896 		timer_active,
2897 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2898 		READ_ONCE(icsk->icsk_retransmits),
2899 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2900 		READ_ONCE(icsk->icsk_probes_out),
2901 		sock_i_ino(sk),
2902 		refcount_read(&sk->sk_refcnt), sk,
2903 		jiffies_to_clock_t(icsk->icsk_rto),
2904 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2905 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2906 		tcp_snd_cwnd(tp),
2907 		state == TCP_LISTEN ?
2908 		    fastopenq->max_qlen :
2909 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2910 }
2911 
2912 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2913 			       struct seq_file *f, int i)
2914 {
2915 	long delta = tw->tw_timer.expires - jiffies;
2916 	__be32 dest, src;
2917 	__u16 destp, srcp;
2918 
2919 	dest  = tw->tw_daddr;
2920 	src   = tw->tw_rcv_saddr;
2921 	destp = ntohs(tw->tw_dport);
2922 	srcp  = ntohs(tw->tw_sport);
2923 
2924 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2925 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2926 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2927 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2928 		refcount_read(&tw->tw_refcnt), tw);
2929 }
2930 
2931 #define TMPSZ 150
2932 
2933 static int tcp4_seq_show(struct seq_file *seq, void *v)
2934 {
2935 	struct tcp_iter_state *st;
2936 	struct sock *sk = v;
2937 
2938 	seq_setwidth(seq, TMPSZ - 1);
2939 	if (v == SEQ_START_TOKEN) {
2940 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2941 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2942 			   "inode");
2943 		goto out;
2944 	}
2945 	st = seq->private;
2946 
2947 	if (sk->sk_state == TCP_TIME_WAIT)
2948 		get_timewait4_sock(v, seq, st->num);
2949 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2950 		get_openreq4(v, seq, st->num);
2951 	else
2952 		get_tcp4_sock(v, seq, st->num);
2953 out:
2954 	seq_pad(seq, '\n');
2955 	return 0;
2956 }
2957 
2958 #ifdef CONFIG_BPF_SYSCALL
2959 union bpf_tcp_iter_batch_item {
2960 	struct sock *sk;
2961 	__u64 cookie;
2962 };
2963 
2964 struct bpf_tcp_iter_state {
2965 	struct tcp_iter_state state;
2966 	unsigned int cur_sk;
2967 	unsigned int end_sk;
2968 	unsigned int max_sk;
2969 	union bpf_tcp_iter_batch_item *batch;
2970 };
2971 
2972 struct bpf_iter__tcp {
2973 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2974 	__bpf_md_ptr(struct sock_common *, sk_common);
2975 	uid_t uid __aligned(8);
2976 };
2977 
2978 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2979 			     struct sock_common *sk_common, uid_t uid)
2980 {
2981 	struct bpf_iter__tcp ctx;
2982 
2983 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2984 	ctx.meta = meta;
2985 	ctx.sk_common = sk_common;
2986 	ctx.uid = uid;
2987 	return bpf_iter_run_prog(prog, &ctx);
2988 }
2989 
2990 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2991 {
2992 	union bpf_tcp_iter_batch_item *item;
2993 	unsigned int cur_sk = iter->cur_sk;
2994 	__u64 cookie;
2995 
2996 	/* Remember the cookies of the sockets we haven't seen yet, so we can
2997 	 * pick up where we left off next time around.
2998 	 */
2999 	while (cur_sk < iter->end_sk) {
3000 		item = &iter->batch[cur_sk++];
3001 		cookie = sock_gen_cookie(item->sk);
3002 		sock_gen_put(item->sk);
3003 		item->cookie = cookie;
3004 	}
3005 }
3006 
3007 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3008 				      unsigned int new_batch_sz, gfp_t flags)
3009 {
3010 	union bpf_tcp_iter_batch_item *new_batch;
3011 
3012 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3013 			     flags | __GFP_NOWARN);
3014 	if (!new_batch)
3015 		return -ENOMEM;
3016 
3017 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3018 	kvfree(iter->batch);
3019 	iter->batch = new_batch;
3020 	iter->max_sk = new_batch_sz;
3021 
3022 	return 0;
3023 }
3024 
3025 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3026 					       union bpf_tcp_iter_batch_item *cookies,
3027 					       int n_cookies)
3028 {
3029 	struct hlist_nulls_node *node;
3030 	struct sock *sk;
3031 	int i;
3032 
3033 	for (i = 0; i < n_cookies; i++) {
3034 		sk = first_sk;
3035 		sk_nulls_for_each_from(sk, node)
3036 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3037 				return sk;
3038 	}
3039 
3040 	return NULL;
3041 }
3042 
3043 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3044 {
3045 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3046 	struct bpf_tcp_iter_state *iter = seq->private;
3047 	struct tcp_iter_state *st = &iter->state;
3048 	unsigned int find_cookie = iter->cur_sk;
3049 	unsigned int end_cookie = iter->end_sk;
3050 	int resume_bucket = st->bucket;
3051 	struct sock *sk;
3052 
3053 	if (end_cookie && find_cookie == end_cookie)
3054 		++st->bucket;
3055 
3056 	sk = listening_get_first(seq);
3057 	iter->cur_sk = 0;
3058 	iter->end_sk = 0;
3059 
3060 	if (sk && st->bucket == resume_bucket && end_cookie) {
3061 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3062 						end_cookie - find_cookie);
3063 		if (!sk) {
3064 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3065 			++st->bucket;
3066 			sk = listening_get_first(seq);
3067 		}
3068 	}
3069 
3070 	return sk;
3071 }
3072 
3073 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3074 {
3075 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3076 	struct bpf_tcp_iter_state *iter = seq->private;
3077 	struct tcp_iter_state *st = &iter->state;
3078 	unsigned int find_cookie = iter->cur_sk;
3079 	unsigned int end_cookie = iter->end_sk;
3080 	int resume_bucket = st->bucket;
3081 	struct sock *sk;
3082 
3083 	if (end_cookie && find_cookie == end_cookie)
3084 		++st->bucket;
3085 
3086 	sk = established_get_first(seq);
3087 	iter->cur_sk = 0;
3088 	iter->end_sk = 0;
3089 
3090 	if (sk && st->bucket == resume_bucket && end_cookie) {
3091 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3092 						end_cookie - find_cookie);
3093 		if (!sk) {
3094 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3095 			++st->bucket;
3096 			sk = established_get_first(seq);
3097 		}
3098 	}
3099 
3100 	return sk;
3101 }
3102 
3103 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3104 {
3105 	struct bpf_tcp_iter_state *iter = seq->private;
3106 	struct tcp_iter_state *st = &iter->state;
3107 	struct sock *sk = NULL;
3108 
3109 	switch (st->state) {
3110 	case TCP_SEQ_STATE_LISTENING:
3111 		sk = bpf_iter_tcp_resume_listening(seq);
3112 		if (sk)
3113 			break;
3114 		st->bucket = 0;
3115 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3116 		fallthrough;
3117 	case TCP_SEQ_STATE_ESTABLISHED:
3118 		sk = bpf_iter_tcp_resume_established(seq);
3119 		break;
3120 	}
3121 
3122 	return sk;
3123 }
3124 
3125 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3126 						 struct sock **start_sk)
3127 {
3128 	struct bpf_tcp_iter_state *iter = seq->private;
3129 	struct hlist_nulls_node *node;
3130 	unsigned int expected = 1;
3131 	struct sock *sk;
3132 
3133 	sock_hold(*start_sk);
3134 	iter->batch[iter->end_sk++].sk = *start_sk;
3135 
3136 	sk = sk_nulls_next(*start_sk);
3137 	*start_sk = NULL;
3138 	sk_nulls_for_each_from(sk, node) {
3139 		if (seq_sk_match(seq, sk)) {
3140 			if (iter->end_sk < iter->max_sk) {
3141 				sock_hold(sk);
3142 				iter->batch[iter->end_sk++].sk = sk;
3143 			} else if (!*start_sk) {
3144 				/* Remember where we left off. */
3145 				*start_sk = sk;
3146 			}
3147 			expected++;
3148 		}
3149 	}
3150 
3151 	return expected;
3152 }
3153 
3154 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3155 						   struct sock **start_sk)
3156 {
3157 	struct bpf_tcp_iter_state *iter = seq->private;
3158 	struct hlist_nulls_node *node;
3159 	unsigned int expected = 1;
3160 	struct sock *sk;
3161 
3162 	sock_hold(*start_sk);
3163 	iter->batch[iter->end_sk++].sk = *start_sk;
3164 
3165 	sk = sk_nulls_next(*start_sk);
3166 	*start_sk = NULL;
3167 	sk_nulls_for_each_from(sk, node) {
3168 		if (seq_sk_match(seq, sk)) {
3169 			if (iter->end_sk < iter->max_sk) {
3170 				sock_hold(sk);
3171 				iter->batch[iter->end_sk++].sk = sk;
3172 			} else if (!*start_sk) {
3173 				/* Remember where we left off. */
3174 				*start_sk = sk;
3175 			}
3176 			expected++;
3177 		}
3178 	}
3179 
3180 	return expected;
3181 }
3182 
3183 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3184 					struct sock **start_sk)
3185 {
3186 	struct bpf_tcp_iter_state *iter = seq->private;
3187 	struct tcp_iter_state *st = &iter->state;
3188 
3189 	if (st->state == TCP_SEQ_STATE_LISTENING)
3190 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3191 	else
3192 		return bpf_iter_tcp_established_batch(seq, start_sk);
3193 }
3194 
3195 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3196 {
3197 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3198 	struct bpf_tcp_iter_state *iter = seq->private;
3199 	struct tcp_iter_state *st = &iter->state;
3200 
3201 	if (st->state == TCP_SEQ_STATE_LISTENING)
3202 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3203 	else
3204 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3205 }
3206 
3207 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3208 {
3209 	struct bpf_tcp_iter_state *iter = seq->private;
3210 	unsigned int expected;
3211 	struct sock *sk;
3212 	int err;
3213 
3214 	sk = bpf_iter_tcp_resume(seq);
3215 	if (!sk)
3216 		return NULL; /* Done */
3217 
3218 	expected = bpf_iter_fill_batch(seq, &sk);
3219 	if (likely(iter->end_sk == expected))
3220 		goto done;
3221 
3222 	/* Batch size was too small. */
3223 	bpf_iter_tcp_unlock_bucket(seq);
3224 	bpf_iter_tcp_put_batch(iter);
3225 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3226 					 GFP_USER);
3227 	if (err)
3228 		return ERR_PTR(err);
3229 
3230 	sk = bpf_iter_tcp_resume(seq);
3231 	if (!sk)
3232 		return NULL; /* Done */
3233 
3234 	expected = bpf_iter_fill_batch(seq, &sk);
3235 	if (likely(iter->end_sk == expected))
3236 		goto done;
3237 
3238 	/* Batch size was still too small. Hold onto the lock while we try
3239 	 * again with a larger batch to make sure the current bucket's size
3240 	 * does not change in the meantime.
3241 	 */
3242 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3243 	if (err) {
3244 		bpf_iter_tcp_unlock_bucket(seq);
3245 		return ERR_PTR(err);
3246 	}
3247 
3248 	expected = bpf_iter_fill_batch(seq, &sk);
3249 	WARN_ON_ONCE(iter->end_sk != expected);
3250 done:
3251 	bpf_iter_tcp_unlock_bucket(seq);
3252 	return iter->batch[0].sk;
3253 }
3254 
3255 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3256 {
3257 	/* bpf iter does not support lseek, so it always
3258 	 * continue from where it was stop()-ped.
3259 	 */
3260 	if (*pos)
3261 		return bpf_iter_tcp_batch(seq);
3262 
3263 	return SEQ_START_TOKEN;
3264 }
3265 
3266 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3267 {
3268 	struct bpf_tcp_iter_state *iter = seq->private;
3269 	struct tcp_iter_state *st = &iter->state;
3270 	struct sock *sk;
3271 
3272 	/* Whenever seq_next() is called, the iter->cur_sk is
3273 	 * done with seq_show(), so advance to the next sk in
3274 	 * the batch.
3275 	 */
3276 	if (iter->cur_sk < iter->end_sk) {
3277 		/* Keeping st->num consistent in tcp_iter_state.
3278 		 * bpf_iter_tcp does not use st->num.
3279 		 * meta.seq_num is used instead.
3280 		 */
3281 		st->num++;
3282 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3283 	}
3284 
3285 	if (iter->cur_sk < iter->end_sk)
3286 		sk = iter->batch[iter->cur_sk].sk;
3287 	else
3288 		sk = bpf_iter_tcp_batch(seq);
3289 
3290 	++*pos;
3291 	/* Keeping st->last_pos consistent in tcp_iter_state.
3292 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3293 	 */
3294 	st->last_pos = *pos;
3295 	return sk;
3296 }
3297 
3298 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3299 {
3300 	struct bpf_iter_meta meta;
3301 	struct bpf_prog *prog;
3302 	struct sock *sk = v;
3303 	uid_t uid;
3304 	int ret;
3305 
3306 	if (v == SEQ_START_TOKEN)
3307 		return 0;
3308 
3309 	if (sk_fullsock(sk))
3310 		lock_sock(sk);
3311 
3312 	if (unlikely(sk_unhashed(sk))) {
3313 		ret = SEQ_SKIP;
3314 		goto unlock;
3315 	}
3316 
3317 	if (sk->sk_state == TCP_TIME_WAIT) {
3318 		uid = 0;
3319 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3320 		const struct request_sock *req = v;
3321 
3322 		uid = from_kuid_munged(seq_user_ns(seq),
3323 				       sk_uid(req->rsk_listener));
3324 	} else {
3325 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3326 	}
3327 
3328 	meta.seq = seq;
3329 	prog = bpf_iter_get_info(&meta, false);
3330 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3331 
3332 unlock:
3333 	if (sk_fullsock(sk))
3334 		release_sock(sk);
3335 	return ret;
3336 
3337 }
3338 
3339 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3340 {
3341 	struct bpf_tcp_iter_state *iter = seq->private;
3342 	struct bpf_iter_meta meta;
3343 	struct bpf_prog *prog;
3344 
3345 	if (!v) {
3346 		meta.seq = seq;
3347 		prog = bpf_iter_get_info(&meta, true);
3348 		if (prog)
3349 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3350 	}
3351 
3352 	if (iter->cur_sk < iter->end_sk)
3353 		bpf_iter_tcp_put_batch(iter);
3354 }
3355 
3356 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3357 	.show		= bpf_iter_tcp_seq_show,
3358 	.start		= bpf_iter_tcp_seq_start,
3359 	.next		= bpf_iter_tcp_seq_next,
3360 	.stop		= bpf_iter_tcp_seq_stop,
3361 };
3362 #endif
3363 static unsigned short seq_file_family(const struct seq_file *seq)
3364 {
3365 	const struct tcp_seq_afinfo *afinfo;
3366 
3367 #ifdef CONFIG_BPF_SYSCALL
3368 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3369 	if (seq->op == &bpf_iter_tcp_seq_ops)
3370 		return AF_UNSPEC;
3371 #endif
3372 
3373 	/* Iterated from proc fs */
3374 	afinfo = pde_data(file_inode(seq->file));
3375 	return afinfo->family;
3376 }
3377 
3378 static const struct seq_operations tcp4_seq_ops = {
3379 	.show		= tcp4_seq_show,
3380 	.start		= tcp_seq_start,
3381 	.next		= tcp_seq_next,
3382 	.stop		= tcp_seq_stop,
3383 };
3384 
3385 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3386 	.family		= AF_INET,
3387 };
3388 
3389 static int __net_init tcp4_proc_init_net(struct net *net)
3390 {
3391 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3392 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3393 		return -ENOMEM;
3394 	return 0;
3395 }
3396 
3397 static void __net_exit tcp4_proc_exit_net(struct net *net)
3398 {
3399 	remove_proc_entry("tcp", net->proc_net);
3400 }
3401 
3402 static struct pernet_operations tcp4_net_ops = {
3403 	.init = tcp4_proc_init_net,
3404 	.exit = tcp4_proc_exit_net,
3405 };
3406 
3407 int __init tcp4_proc_init(void)
3408 {
3409 	return register_pernet_subsys(&tcp4_net_ops);
3410 }
3411 
3412 void tcp4_proc_exit(void)
3413 {
3414 	unregister_pernet_subsys(&tcp4_net_ops);
3415 }
3416 #endif /* CONFIG_PROC_FS */
3417 
3418 struct proto tcp_prot = {
3419 	.name			= "TCP",
3420 	.owner			= THIS_MODULE,
3421 	.close			= tcp_close,
3422 	.pre_connect		= tcp_v4_pre_connect,
3423 	.connect		= tcp_v4_connect,
3424 	.disconnect		= tcp_disconnect,
3425 	.accept			= inet_csk_accept,
3426 	.ioctl			= tcp_ioctl,
3427 	.init			= tcp_v4_init_sock,
3428 	.destroy		= tcp_v4_destroy_sock,
3429 	.shutdown		= tcp_shutdown,
3430 	.setsockopt		= tcp_setsockopt,
3431 	.getsockopt		= tcp_getsockopt,
3432 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3433 	.keepalive		= tcp_set_keepalive,
3434 	.recvmsg		= tcp_recvmsg,
3435 	.sendmsg		= tcp_sendmsg,
3436 	.splice_eof		= tcp_splice_eof,
3437 	.backlog_rcv		= tcp_v4_do_rcv,
3438 	.release_cb		= tcp_release_cb,
3439 	.hash			= inet_hash,
3440 	.unhash			= inet_unhash,
3441 	.get_port		= inet_csk_get_port,
3442 	.put_port		= inet_put_port,
3443 #ifdef CONFIG_BPF_SYSCALL
3444 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3445 #endif
3446 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3447 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3448 	.stream_memory_free	= tcp_stream_memory_free,
3449 	.sockets_allocated	= &tcp_sockets_allocated,
3450 
3451 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3452 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3453 
3454 	.memory_pressure	= &tcp_memory_pressure,
3455 	.sysctl_mem		= sysctl_tcp_mem,
3456 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3457 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3458 	.max_header		= MAX_TCP_HEADER,
3459 	.obj_size		= sizeof(struct tcp_sock),
3460 	.freeptr_offset		= offsetof(struct tcp_sock,
3461 					   inet_conn.icsk_inet.sk.sk_freeptr),
3462 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3463 	.twsk_prot		= &tcp_timewait_sock_ops,
3464 	.rsk_prot		= &tcp_request_sock_ops,
3465 	.h.hashinfo		= NULL,
3466 	.no_autobind		= true,
3467 	.diag_destroy		= tcp_abort,
3468 };
3469 EXPORT_SYMBOL(tcp_prot);
3470 
3471 static void __net_exit tcp_sk_exit(struct net *net)
3472 {
3473 	if (net->ipv4.tcp_congestion_control)
3474 		bpf_module_put(net->ipv4.tcp_congestion_control,
3475 			       net->ipv4.tcp_congestion_control->owner);
3476 }
3477 
3478 static void __net_init tcp_set_hashinfo(struct net *net)
3479 {
3480 	struct inet_hashinfo *hinfo;
3481 	unsigned int ehash_entries;
3482 	struct net *old_net;
3483 
3484 	if (net_eq(net, &init_net))
3485 		goto fallback;
3486 
3487 	old_net = current->nsproxy->net_ns;
3488 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3489 	if (!ehash_entries)
3490 		goto fallback;
3491 
3492 	ehash_entries = roundup_pow_of_two(ehash_entries);
3493 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3494 	if (!hinfo) {
3495 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3496 			"for a netns, fallback to the global one\n",
3497 			ehash_entries);
3498 fallback:
3499 		hinfo = &tcp_hashinfo;
3500 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3501 	}
3502 
3503 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3504 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3505 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3506 }
3507 
3508 static int __net_init tcp_sk_init(struct net *net)
3509 {
3510 	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3511 	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3512 	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3513 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3514 
3515 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3516 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3517 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3518 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3519 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3520 
3521 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3522 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3523 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3524 
3525 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3526 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3527 	net->ipv4.sysctl_tcp_syncookies = 1;
3528 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3529 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3530 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3531 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3532 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3533 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3534 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3535 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3536 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3537 
3538 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3539 	tcp_set_hashinfo(net);
3540 
3541 	net->ipv4.sysctl_tcp_sack = 1;
3542 	net->ipv4.sysctl_tcp_window_scaling = 1;
3543 	net->ipv4.sysctl_tcp_timestamps = 1;
3544 	net->ipv4.sysctl_tcp_early_retrans = 3;
3545 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3546 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3547 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3548 	net->ipv4.sysctl_tcp_max_reordering = 300;
3549 	net->ipv4.sysctl_tcp_dsack = 1;
3550 	net->ipv4.sysctl_tcp_app_win = 31;
3551 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3552 	net->ipv4.sysctl_tcp_frto = 2;
3553 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3554 	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
3555 	/* This limits the percentage of the congestion window which we
3556 	 * will allow a single TSO frame to consume.  Building TSO frames
3557 	 * which are too large can cause TCP streams to be bursty.
3558 	 */
3559 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3560 	/* Default TSQ limit of 4 MB */
3561 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3562 
3563 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3564 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3565 
3566 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3567 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3568 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3569 	net->ipv4.sysctl_tcp_autocorking = 1;
3570 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3571 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3572 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3573 	if (net != &init_net) {
3574 		memcpy(net->ipv4.sysctl_tcp_rmem,
3575 		       init_net.ipv4.sysctl_tcp_rmem,
3576 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3577 		memcpy(net->ipv4.sysctl_tcp_wmem,
3578 		       init_net.ipv4.sysctl_tcp_wmem,
3579 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3580 	}
3581 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3582 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
3583 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3584 	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
3585 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3586 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3587 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3588 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3589 
3590 	/* Set default values for PLB */
3591 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3592 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3593 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3594 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3595 	/* Default congestion threshold for PLB to mark a round is 50% */
3596 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3597 
3598 	/* Reno is always built in */
3599 	if (!net_eq(net, &init_net) &&
3600 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3601 			       init_net.ipv4.tcp_congestion_control->owner))
3602 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3603 	else
3604 		net->ipv4.tcp_congestion_control = &tcp_reno;
3605 
3606 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3607 	net->ipv4.sysctl_tcp_shrink_window = 0;
3608 
3609 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3610 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3611 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3612 
3613 	return 0;
3614 }
3615 
3616 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3617 {
3618 	struct net *net;
3619 
3620 	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3621 	 * and failed setup_net error unwinding path are serialized.
3622 	 *
3623 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3624 	 * net_exit_list, the thread that dismantles a particular twsk must
3625 	 * do so without other thread progressing to refcount_dec_and_test() of
3626 	 * tcp_death_row.tw_refcount.
3627 	 */
3628 	mutex_lock(&tcp_exit_batch_mutex);
3629 
3630 	tcp_twsk_purge(net_exit_list);
3631 
3632 	list_for_each_entry(net, net_exit_list, exit_list) {
3633 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3634 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3635 		tcp_fastopen_ctx_destroy(net);
3636 	}
3637 
3638 	mutex_unlock(&tcp_exit_batch_mutex);
3639 }
3640 
3641 static struct pernet_operations __net_initdata tcp_sk_ops = {
3642        .init	   = tcp_sk_init,
3643        .exit	   = tcp_sk_exit,
3644        .exit_batch = tcp_sk_exit_batch,
3645 };
3646 
3647 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3648 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3649 		     struct sock_common *sk_common, uid_t uid)
3650 
3651 #define INIT_BATCH_SZ 16
3652 
3653 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3654 {
3655 	struct bpf_tcp_iter_state *iter = priv_data;
3656 	int err;
3657 
3658 	err = bpf_iter_init_seq_net(priv_data, aux);
3659 	if (err)
3660 		return err;
3661 
3662 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3663 	if (err) {
3664 		bpf_iter_fini_seq_net(priv_data);
3665 		return err;
3666 	}
3667 
3668 	return 0;
3669 }
3670 
3671 static void bpf_iter_fini_tcp(void *priv_data)
3672 {
3673 	struct bpf_tcp_iter_state *iter = priv_data;
3674 
3675 	bpf_iter_fini_seq_net(priv_data);
3676 	kvfree(iter->batch);
3677 }
3678 
3679 static const struct bpf_iter_seq_info tcp_seq_info = {
3680 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3681 	.init_seq_private	= bpf_iter_init_tcp,
3682 	.fini_seq_private	= bpf_iter_fini_tcp,
3683 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3684 };
3685 
3686 static const struct bpf_func_proto *
3687 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3688 			    const struct bpf_prog *prog)
3689 {
3690 	switch (func_id) {
3691 	case BPF_FUNC_setsockopt:
3692 		return &bpf_sk_setsockopt_proto;
3693 	case BPF_FUNC_getsockopt:
3694 		return &bpf_sk_getsockopt_proto;
3695 	default:
3696 		return NULL;
3697 	}
3698 }
3699 
3700 static struct bpf_iter_reg tcp_reg_info = {
3701 	.target			= "tcp",
3702 	.ctx_arg_info_size	= 1,
3703 	.ctx_arg_info		= {
3704 		{ offsetof(struct bpf_iter__tcp, sk_common),
3705 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3706 	},
3707 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3708 	.seq_info		= &tcp_seq_info,
3709 };
3710 
3711 static void __init bpf_iter_register(void)
3712 {
3713 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3714 	if (bpf_iter_reg_target(&tcp_reg_info))
3715 		pr_warn("Warning: could not register bpf iterator tcp\n");
3716 }
3717 
3718 #endif
3719 
3720 void __init tcp_v4_init(void)
3721 {
3722 	int cpu, res;
3723 
3724 	for_each_possible_cpu(cpu) {
3725 		struct sock *sk;
3726 
3727 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3728 					   IPPROTO_TCP, &init_net);
3729 		if (res)
3730 			panic("Failed to create the TCP control socket.\n");
3731 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3732 
3733 		/* Please enforce IP_DF and IPID==0 for RST and
3734 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3735 		 */
3736 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3737 
3738 		sk->sk_clockid = CLOCK_MONOTONIC;
3739 
3740 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3741 	}
3742 	if (register_pernet_subsys(&tcp_sk_ops))
3743 		panic("Failed to create the TCP control socket.\n");
3744 
3745 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3746 	bpf_iter_register();
3747 #endif
3748 }
3749