xref: /linux/net/ipv4/tcp_ipv4.c (revision a34b0e4e21d6be3c3d620aa7f9dfbf0e9550c19e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 
92 #include <trace/events/tcp.h>
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
/* Hash tables for TCP socket lookup (shared by IPv4 and IPv6 paths). */
struct inet_hashinfo tcp_hashinfo;

/* Per-CPU control socket used to send replies (RST/ACK) outside full
 * socket context; bh_lock serializes nested-BH users (see the
 * local_lock_nested_bh() callers below).
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* NOTE(review): not used in this chunk — presumably serializes netns
 * exit batching elsewhere in the file; confirm against its users.
 */
static DEFINE_MUTEX(tcp_exit_batch_mutex);
106 
107 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108 {
109 	return secure_tcp_seq(ip_hdr(skb)->daddr,
110 			      ip_hdr(skb)->saddr,
111 			      tcp_hdr(skb)->dest,
112 			      tcp_hdr(skb)->source);
113 }
114 
115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116 {
117 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
118 }
119 
/* Decide whether a new outgoing connection may take over the port pair
 * currently held by TIME-WAIT socket @sktw.
 *
 * Returns 1 (after taking a reference on @sktw) when reuse is allowed,
 * 0 otherwise.  NOTE(review): @twp == NULL appears to mean the caller has
 * already committed to this port pair — confirm against the hash-connect
 * callers.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	/* Buckets still in FIN-WAIT2 substate are never reused. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* sysctl value 2 limits reuse to loopback traffic.
		 * Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	/* Do not reuse before the configured delay has elapsed since the
	 * bucket entered TIME-WAIT.
	 */
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* Start beyond the old send space; 0 is avoided
			 * because it means "unset" to the code below.
			 */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
207 
/* Run the cgroup-BPF INET4 connect hook before tcp_v4_connect() proper.
 * Returns 0 or a negative errno from the BPF program.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	/* Lockdep assertion: caller must hold the socket lock. */
	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
222 
/* This will initiate an outgoing connection.
 *
 * Validates the destination address, routes it, binds a source address
 * and port, enters SYN-SENT and sends the first SYN via tcp_connect().
 * Returns 0 on success or a negative errno; on failure the socket is
 * moved back to TCP_CLOSE and its binding state is reset.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With source routing, the first hop comes from the option, not
	 * from the destination address.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast or broadcast addresses. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	/* Commit the route-chosen source address if none was bound yet. */
	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-route now that the source port is finally known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* Fast Open may defer the actual SYN until sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);
367 
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	/* Nothing to do for listening or closed sockets. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	/* mtu_info was stored by tcp_v4_err(); read it once. */
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
407 
408 static void do_redirect(struct sk_buff *skb, struct sock *sk)
409 {
410 	struct dst_entry *dst = __sk_dst_check(sk, 0);
411 
412 	if (dst)
413 		dst->ops->redirect(dst, sk, skb);
414 }
415 
416 
417 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
418 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
419 {
420 	struct request_sock *req = inet_reqsk(sk);
421 	struct net *net = sock_net(sk);
422 
423 	/* ICMPs are not backlogged, hence we cannot get
424 	 * an established socket here.
425 	 */
426 	if (seq != tcp_rsk(req)->snt_isn) {
427 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
428 	} else if (abort) {
429 		/*
430 		 * Still in SYN_RECV, just remove it silently.
431 		 * There is no good way to pass the error to the newly
432 		 * created socket, and POSIX does not want network
433 		 * errors returned from accept().
434 		 */
435 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
436 		tcp_listendrop(req->rsk_listener);
437 	}
438 	reqsk_put(req);
439 }
440 EXPORT_IPV6_MOD(tcp_req_err);
441 
/* TCP-LD (RFC 6069) logic: an ICMP unreachable for the oldest
 * unacknowledged segment suggests the earlier RTO was spurious, so undo
 * one step of exponential backoff and rearm (or fire) the retransmit
 * timer.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	/* Only safe to touch timer state when the socket is not owned. */
	if (sock_owned_by_user(sk))
		return;

	/* Must reference snd_una and we must actually be backing off. */
	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one backoff step and recompute the RTO from srtt (or the
	 * initial timeout if we have no RTT sample yet).
	 */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	/* Time already elapsed since the head skb was (re)transmitted. */
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
480 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the embedded IP header of the offending
	 * segment quoted inside the ICMP payload.
	 */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Abort the request only for errors that indicate the
		 * destination is truly unreachable.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* The quoted sequence must fall inside our send window. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			/* Store the new MTU; if the socket is owned, defer
			 * the PMTU update to tcp_release_cb().
			 */
			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
663 
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 	struct tcphdr *th = tcp_hdr(skb);
667 
668 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 	skb->csum_start = skb_transport_header(skb) - skb->head;
670 	skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672 
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 	const struct inet_sock *inet = inet_sk(sk);
677 
678 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_IPV6_MOD(tcp_v4_send_check);
681 
682 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683 
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 				 const struct tcp_ao_hdr *aoh,
686 				 struct ip_reply_arg *arg, struct tcphdr *reply,
687 				 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 	int sdif = tcp_v4_sdif(skb);
691 	int dif = inet_iif(skb);
692 	int l3index = sdif ? dif : 0;
693 	bool allocated_traffic_key;
694 	struct tcp_ao_key *key;
695 	char *traffic_key;
696 	bool drop = true;
697 	u32 ao_sne = 0;
698 	u8 keyid;
699 
700 	rcu_read_lock();
701 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 				 &key, &traffic_key, &allocated_traffic_key,
703 				 &keyid, &ao_sne))
704 		goto out;
705 
706 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 				 (aoh->rnext_keyid << 8) | keyid);
708 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 	reply->doff = arg->iov[0].iov_len / 4;
710 
711 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 			    key, traffic_key,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 			    reply, ao_sne))
716 		goto out;
717 	drop = false;
718 out:
719 	rcu_read_unlock();
720 	if (allocated_traffic_key)
721 		kfree(traffic_key);
722 	return drop;
723 #else
724 	return true;
725 #endif
726 }
727 
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply: TCP header followed by optional MD5/AO/MPTCP option space. */
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* RFC 793: seq from the incoming ACK if present, otherwise ACK the
	 * exact amount of data consumed by the offending segment.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* If the segment was AO-signed, the reset must be too (or dropped). */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not loosen security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* sk may be a timewait sock; the twsk/sk field layouts must agree. */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Use this CPU's control socket to emit the reset. */
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
928 
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK (optionally carrying timestamp and MD5/AO
 * options) in reply to @skb, using this CPU's control socket.  @sk may be
 * a timewait or request socket, hence the sk_state checks below.
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	/* Echo a timestamp option when the caller supplied one. */
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* MD5 option goes after the timestamp option, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	/* Timewait sockets keep mark/priority in the twsk fields. */
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
1035 
/* Send the ACK a TIME-WAIT socket owes for @skb, signed with AO or MD5
 * when the old connection used them.  Consumes the timewait reference.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: drop, but keep the ref balance. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
	/* The #else arm keeps the else-if chain syntactically valid when
	 * CONFIG_TCP_AO is off: "if (0)" compiles away entirely.
	 */
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1099 
1100 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1101 				  struct request_sock *req)
1102 {
1103 	struct tcp_key key = {};
1104 
1105 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1106 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1107 	 */
1108 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1109 					     tcp_sk(sk)->snd_nxt;
1110 
1111 #ifdef CONFIG_TCP_AO
1112 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1113 	    tcp_rsk_used_ao(req)) {
1114 		const union tcp_md5_addr *addr;
1115 		const struct tcp_ao_hdr *aoh;
1116 		int l3index;
1117 
1118 		/* Invalid TCP option size or twice included auth */
1119 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1120 			return;
1121 		if (!aoh)
1122 			return;
1123 
1124 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1125 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1126 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1127 					      aoh->rnext_keyid, -1);
1128 		if (unlikely(!key.ao_key)) {
1129 			/* Send ACK with any matching MKT for the peer */
1130 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1131 			/* Matching key disappeared (user removed the key?)
1132 			 * let the handshake timeout.
1133 			 */
1134 			if (!key.ao_key) {
1135 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1136 						     addr,
1137 						     ntohs(tcp_hdr(skb)->source),
1138 						     &ip_hdr(skb)->daddr,
1139 						     ntohs(tcp_hdr(skb)->dest));
1140 				return;
1141 			}
1142 		}
1143 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1144 		if (!key.traffic_key)
1145 			return;
1146 
1147 		key.type = TCP_KEY_AO;
1148 		key.rcv_next = aoh->keyid;
1149 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1150 #else
1151 	if (0) {
1152 #endif
1153 	} else if (static_branch_tcp_md5()) {
1154 		const union tcp_md5_addr *addr;
1155 		int l3index;
1156 
1157 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1158 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1159 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1160 		if (key.md5_key)
1161 			key.type = TCP_KEY_MD5;
1162 	}
1163 
1164 	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
1165 	tcp_v4_send_ack(sk, skb, seq,
1166 			tcp_rsk(req)->rcv_nxt,
1167 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1168 			tcp_rsk_tsval(tcp_rsk(req)),
1169 			req->ts_recent,
1170 			0, &key,
1171 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1172 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1173 			READ_ONCE(tcp_rsk(req)->txhash));
1174 	if (tcp_key_is_ao(&key))
1175 		kfree(key.traffic_key);
1176 }
1177 
1178 /*
1179  *	Send a SYN-ACK after having received a SYN.
1180  *	This still operates on a request_sock only, not on a big
1181  *	socket.
1182  */
/* Build and transmit the SYN-ACK for @req.  @dst may be a pre-resolved
 * route (syncookies) or NULL, in which case one is looked up here.
 * Returns 0 on success, -1 on routing/allocation failure, or a
 * net_xmit_eval()-translated transmit error.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		/* Remember the ECN bits we send on the SYN-ACK. */
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* Optionally reflect the SYN's DSCP, keeping our own ECN bits. */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}
1227 
1228 /*
1229  *	IPv4 request_sock destructor.
1230  */
1231 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1232 {
1233 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1234 }
1235 
1236 #ifdef CONFIG_TCP_MD5SIG
1237 /*
1238  * RFC2385 MD5 checksumming requires a mapping of
1239  * IP address->MD5 Key.
1240  * We need to maintain these in the sk structure.
1241  */
1242 
/* Deferred static key gating the MD5 code paths: enabled only while MD5
 * keys exist; the HZ timeout batches decrements to limit branch patching.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);
1245 
1246 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1247 {
1248 	if (!old)
1249 		return true;
1250 
1251 	/* l3index always overrides non-l3index */
1252 	if (old->l3index && new->l3index == 0)
1253 		return false;
1254 	if (old->l3index == 0 && new->l3index)
1255 		return true;
1256 
1257 	return old->prefixlen < new->prefixlen;
1258 }
1259 
1260 /* Find the Key structure for an address.  */
/* Best-match lookup: among keys whose prefix covers @addr (and whose L3
 * scope matches @l3index unless @any_l3index), better_md5_match() picks
 * the winner.  Returns NULL when nothing matches.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Skip keys bound to a different L3 domain, unless told not to. */
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1304 
/* Exact-match lookup: family, IFINDEX flag, l3index, prefix length and
 * address must all be identical — in contrast to the best-match
 * semantics of __tcp_md5_do_lookup().  Used by the add/del paths.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1338 
1339 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1340 					 const struct sock *addr_sk)
1341 {
1342 	const union tcp_md5_addr *addr;
1343 	int l3index;
1344 
1345 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1346 						 addr_sk->sk_bound_dev_if);
1347 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1348 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1349 }
1350 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1351 
1352 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1353 {
1354 	struct tcp_sock *tp = tcp_sk(sk);
1355 	struct tcp_md5sig_info *md5sig;
1356 
1357 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1358 	if (!md5sig)
1359 		return -ENOMEM;
1360 
1361 	sk_gso_disable(sk);
1362 	INIT_HLIST_HEAD(&md5sig->head);
1363 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1364 	return 0;
1365 }
1366 
1367 /* This can be called on a newly created socket, from other files */
/* Insert a new MD5 key, or update an existing exact match in place.
 * Caller holds the socket lock; md5sig_info must already be allocated.
 * Returns 0 or -ENOMEM.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	/* __GFP_ZERO: see the keylen/key[] ordering comment above. */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	/* RCU publish: readers may see the key as soon as it is linked. */
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
1416 
/* Add or update an MD5 key on a full socket (socket lock held).  On
 * first use, allocates the key list and takes a tcp_md5_needed
 * static-branch reference.  Returns 0, -EOPNOTSUPP under FIPS,
 * -ENOMEM, or -EUSERS when the static-key refcount is saturated.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Roll back the freshly added (still empty) list. */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);
1446 
/* Copy an existing MD5 key onto another socket (e.g. listener -> child).
 * Runs in atomic context: uses GFP_ATOMIC and the non-sleeping
 * static-key increment.  Returns 0, -ENOMEM or -EUSERS.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Roll back the freshly added (still empty) list. */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);
1474 
/* Delete the exact-matching MD5 key.  The entry is unlinked under RCU
 * and freed after a grace period, so concurrent lookups remain safe.
 * Returns 0 or -ENOENT.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	/* Uncharge the socket's option-memory accounting. */
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);
1489 
1490 void tcp_clear_md5_list(struct sock *sk)
1491 {
1492 	struct tcp_sock *tp = tcp_sk(sk);
1493 	struct tcp_md5sig_key *key;
1494 	struct hlist_node *n;
1495 	struct tcp_md5sig_info *md5sig;
1496 
1497 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1498 
1499 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1500 		hlist_del(&key->node);
1501 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1502 		kfree(key);
1503 	}
1504 }
1505 
/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: validate the user's
 * struct tcp_md5sig and add — or, when tcpm_keylen is 0, delete — the
 * matching key.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	/* TCP_MD5SIG_EXT may narrow the match with a prefix length. */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	/* An IFINDEX-scoped key must name an L3 master device. */
	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* Zero key length means delete. */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
1571 
1572 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1573 				    __be32 daddr, __be32 saddr,
1574 				    const struct tcphdr *th, int nbytes)
1575 {
1576 	struct {
1577 		struct tcp4_pseudohdr ip;
1578 		struct tcphdr tcp;
1579 	} h;
1580 
1581 	h.ip.saddr = saddr;
1582 	h.ip.daddr = daddr;
1583 	h.ip.pad = 0;
1584 	h.ip.protocol = IPPROTO_TCP;
1585 	h.ip.len = cpu_to_be16(nbytes);
1586 	h.tcp = *th;
1587 	h.tcp.check = 0;
1588 	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1589 }
1590 
1591 static noinline_for_stack void
1592 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1593 		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
1594 {
1595 	struct md5_ctx ctx;
1596 
1597 	md5_init(&ctx);
1598 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1599 	tcp_md5_hash_key(&ctx, key);
1600 	md5_final(&ctx, md5_hash);
1601 }
1602 
1603 noinline_for_stack void
1604 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1605 		    const struct sock *sk, const struct sk_buff *skb)
1606 {
1607 	const struct tcphdr *th = tcp_hdr(skb);
1608 	__be32 saddr, daddr;
1609 	struct md5_ctx ctx;
1610 
1611 	if (sk) { /* valid for establish/request sockets */
1612 		saddr = sk->sk_rcv_saddr;
1613 		daddr = sk->sk_daddr;
1614 	} else {
1615 		const struct iphdr *iph = ip_hdr(skb);
1616 		saddr = iph->saddr;
1617 		daddr = iph->daddr;
1618 	}
1619 
1620 	md5_init(&ctx);
1621 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1622 	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1623 	tcp_md5_hash_key(&ctx, key);
1624 	md5_final(&ctx, md5_hash);
1625 }
1626 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1627 
1628 #endif
1629 
1630 static void tcp_v4_init_req(struct request_sock *req,
1631 			    const struct sock *sk_listener,
1632 			    struct sk_buff *skb)
1633 {
1634 	struct inet_request_sock *ireq = inet_rsk(req);
1635 	struct net *net = sock_net(sk_listener);
1636 
1637 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1638 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1639 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1640 }
1641 
/* Initialize the request sock from @skb, consult LSM policy, then route
 * the SYN-ACK.  Returns NULL (drop) on security denial or routing failure.
 * @tw_isn is unused by the IPv4 implementation.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1655 
/* Generic request_sock hooks for IPv4 TCP. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};
1663 
/* IPv4-specific request_sock operations (MD5/AO/syncookie hooks are
 * compiled in only with the corresponding CONFIG options).
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1683 
1684 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1685 {
1686 	/* Never answer to SYNs send to broadcast or multicast */
1687 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1688 		goto drop;
1689 
1690 	return tcp_conn_request(&tcp_request_sock_ops,
1691 				&tcp_request_sock_ipv4_ops, sk, skb);
1692 
1693 drop:
1694 	tcp_listendrop(sk);
1695 	return 0;
1696 }
1697 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1698 
1699 
1700 /*
1701  * The three way handshake has completed - we got a valid synack -
1702  * now create the new socket.
1703  */
/* Create the child socket once the 3WHS completes: clone the listener,
 * fill IPv4 specifics (options, ToS, route, MSS), copy MD5/AO keys,
 * inherit the port and hash the new socket.  Returns the child, or NULL
 * on failure (accept queue full, OOM, routing failure, ...).
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	/* Insert into the established hash; found_dup_sk flags a race with
	 * an already-inserted twin of this connection.
	 */
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1818 
/* On a listener, let a non-SYN segment (the ACK completing a syncookie
 * handshake) go through cookie validation, which may return a newly
 * created child socket, @sk itself, or NULL to drop.  Without
 * CONFIG_SYN_COOKIES this returns @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	/* Cookies are carried by the 3WHS-completing ACK, not by SYNs. */
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1829 
/* Generate a syncookie ISN in *cookie for the SYN described by @iph/@th
 * and return the MSS encoded into it; returns 0 when no cookie can be
 * produced (e.g. CONFIG_SYN_COOKIES is off or no usable MSS).
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		/* Record that the listener operated in cookie mode. */
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1844 
1845 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1846 							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached input route if it went stale or the
			 * packet arrived on a different interface.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			/* A syncookie ACK produced a child: process it there. */
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1930 
/* Early demux: before routing, try to match the packet to an established
 * socket and pre-attach skb->sk plus the socket's cached input dst,
 * saving a route lookup in the main receive path.  Always returns 0.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	/* Need at least a complete TCP header in the linear area. */
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Reject a bogus data offset. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			/* Only reuse the cached dst for the same ingress device. */
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1968 
/* tcp_add_backlog - queue @skb on the backlog of a user-owned socket.
 *
 * Called with the socket spinlock held. First tries to coalesce the
 * segment with the current backlog tail to save memory and processing.
 * Returns false when the skb was queued or merged. Returns true when
 * the skb must be dropped; in that case *reason is set and the socket
 * spinlock has already been released here.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing needs contiguous sequence numbers, identical DSCP,
	 * no SYN/RST/URG on either segment, ACK on both, matching
	 * ECE/CWR/AE bits, identical header length and option bytes,
	 * and compatible skb metadata.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Only move ack_seq/window forward, never backwards. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Coalescing failed: restore the header we pulled above. */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		/* -ENOMEM means a pfmemalloc skb hit a non-SOCK_MEMALLOC
		 * socket; anything else is a backlog limit overflow.
		 */
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
EXPORT_IPV6_MOD(tcp_add_backlog);
2111 
2112 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2113 {
2114 	struct tcphdr *th = (struct tcphdr *)skb->data;
2115 
2116 	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
2117 }
2118 EXPORT_IPV6_MOD(tcp_filter);
2119 
/* Undo tcp_v4_fill_cb(): move the saved IPv4 control block back to the
 * front of skb->cb, so code that re-enters lower layers or re-does the
 * socket lookup finds IPCB() where it expects it.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
2125 
/* Populate TCP_SKB_CB() from the packet headers, first preserving the
 * IPv4 control block at its new location inside the TCP control block
 * (tcp_v4_restore_cb() is the inverse operation).
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* end_seq counts SYN and FIN as one sequence unit each. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2146 
2147 /*
2148  *	From tcp_input.c
2149  */
2150 
/* tcp_v4_rcv - main IPv4 TCP receive routine.
 *
 * Called from the IP layer for every TCP segment addressed to this
 * host. Validates header and checksum, looks up the owning socket and
 * dispatches to the appropriate handler, including the TIME_WAIT and
 * NEW_SYN_RECV minisocket cases. The skb is always consumed.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload header pointers: the pulls above may have moved data. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	/* Request minisocket: the 3WHS may complete right here. */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_skbadd(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb, &drop_reason)) {
			/* The filter may have trimmed/reallocated the skb. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
					    &drop_reason);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				goto discard_and_relse;
			}
			sock_put(sk);
			return 0;
		}
	}

	/* At this point sk is a full or listening socket we deliver to. */
process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb, &drop_reason))
		goto discard_and_relse;

	/* The filter may have trimmed/reallocated the skb. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	/* Process directly if we own the socket, else defer to backlog. */
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_skbadd(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

	/* Segment matched a TIME_WAIT minisocket. */
do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}

	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
					       &drop_reason);
	switch (tw_status) {
	case TCP_TW_SYN: {
		/* A new SYN may legitimately reuse the TIME_WAIT tuple:
		 * hand it to a listener if one exists.
		 */
		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}

		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
		if (drop_reason)
			break;
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
	case TCP_TW_ACK_OOW:
		tcp_v4_timewait_ack(sk, skb, tw_status);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2416 
/* Size descriptor for the TIME_WAIT minisockets created by IPv4 TCP. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
};
2420 
2421 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2422 {
2423 	struct dst_entry *dst = skb_dst(skb);
2424 
2425 	if (dst && dst_hold_safe(dst)) {
2426 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2427 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2428 	}
2429 }
2430 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2431 
/* AF_INET operations table for connection-oriented sockets, installed
 * into icsk->icsk_af_ops by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_IPV6_MOD(ipv4_specific);
2445 
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 signing hooks for TCP-MD5 and/or TCP-AO, installed into
 * tp->af_specific by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup		= tcp_v4_ao_lookup,
	.calc_ao_hash		= tcp_v4_ao_hash_skb,
	.ao_parse		= tcp_v4_parse_ao,
	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
#endif
};

/* Socket destructor that releases MD5/AO key state before running the
 * generic inet destructor.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
#endif
2468 
2469 /* NOTE: A lot of things set to zero explicitly by call to
2470  *       sk_alloc() so need not be done here.
2471  */
/* Per-socket init for IPv4 TCP: generic TCP setup, then the IPv4
 * specific operation tables (and signing hooks when configured).
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	sk->sk_destruct = tcp4_destruct_sock;
#endif

	return 0;
}
2487 
/* Drop every page-pool reference still tracked in sk->sk_user_frags.
 * No-op unless CONFIG_PAGE_POOL is enabled.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}
2498 
/* Final TCP cleanup when a socket is destroyed: stop timers, purge the
 * queues and release all auxiliary state (congestion control, ULP,
 * bind bucket, fastopen, saved SYN).
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2538 
2539 #ifdef CONFIG_PROC_FS
2540 /* Proc filesystem TCP sock list dumping. */
2541 
2542 static unsigned short seq_file_family(const struct seq_file *seq);
2543 
2544 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2545 {
2546 	unsigned short family = seq_file_family(seq);
2547 
2548 	/* AF_UNSPEC is used as a match all */
2549 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2550 		net_eq(sock_net(sk), seq_file_net(seq)));
2551 }
2552 
2553 /* Find a non empty bucket (starting from st->bucket)
2554  * and return the first sk from it.
2555  */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		/* NOTE: when a socket is found we return with ilb2->lock
		 * still held; listening_get_next() or tcp_seq_stop()
		 * releases it.
		 */
		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2581 
2582 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2583  * If "cur" is the last one in the st->bucket,
2584  * call listening_get_first() to return the first sk of the next
2585  * non empty bucket.
2586  */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	/* Try the remainder of the current (locked) bucket first. */
	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: drop its lock and move to the next one. */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
2610 
2611 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2612 {
2613 	struct tcp_iter_state *st = seq->private;
2614 	void *rc;
2615 
2616 	st->bucket = 0;
2617 	st->offset = 0;
2618 	rc = listening_get_first(seq);
2619 
2620 	while (rc && *pos) {
2621 		rc = listening_get_next(seq, rc);
2622 		--*pos;
2623 	}
2624 	return rc;
2625 }
2626 
/* Lockless check whether the current ehash bucket has any entries. */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
2632 
2633 /*
2634  * Get first established socket starting from bucket given in st->bucket.
2635  * If st->bucket is zero, the very first socket in the hash is returned.
2636  */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		/* NOTE: when a socket is found we return with the bucket
		 * lock held (BH disabled); established_get_next() or
		 * tcp_seq_stop() releases it.
		 */
		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
2664 
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	/* Try the remainder of the current (locked) bucket first. */
	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: drop its lock and move to the next one. */
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2686 
2687 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2688 {
2689 	struct tcp_iter_state *st = seq->private;
2690 	void *rc;
2691 
2692 	st->bucket = 0;
2693 	rc = established_get_first(seq);
2694 
2695 	while (rc && pos) {
2696 		rc = established_get_next(seq, rc);
2697 		--pos;
2698 	}
2699 	return rc;
2700 }
2701 
2702 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2703 {
2704 	void *rc;
2705 	struct tcp_iter_state *st = seq->private;
2706 
2707 	st->state = TCP_SEQ_STATE_LISTENING;
2708 	rc	  = listening_get_idx(seq, &pos);
2709 
2710 	if (!rc) {
2711 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2712 		rc	  = established_get_idx(seq, pos);
2713 	}
2714 
2715 	return rc;
2716 }
2717 
/* Re-seek to the position previously saved in st (state/bucket/offset)
 * so a restarted read does not rescan from the very beginning.
 * Returns the socket at that position, or NULL when it is gone.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Replay the saved in-bucket offset; stop early if the
		 * bucket changed (entries disappeared meanwhile).
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	/* The helpers above bumped st->num; restore the true count. */
	st->num = orig_num;

	return rc;
}
2751 
/* seq_file ->start(): resume from the cached position when *pos still
 * matches it, otherwise restart and walk to *pos from the beginning.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_start);
2774 
/* seq_file ->next(): advance the iterator one socket, switching from
 * the listening hash to the established hash when the former runs out.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_next);
2805 
/* seq_file ->stop(): release whichever bucket lock the iterator still
 * holds (see listening_get_first()/established_get_first()).
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_IPV6_MOD(tcp_seq_stop);
2823 
/* Format one SYN_RECV request socket as a /proc/net/tcp line. */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2849 
/* Format one full TCP socket as a /proc/net/tcp line. All reads are
 * lockless, so transiently inconsistent values are possible.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Map the pending icsk timer to the historical /proc encoding:
	 * 1 = retransmit-like, 4 = zero window probe, 2 = keepalive.
	 */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active	= 2;
		timer_expires	= icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2913 
/* Format one TIME_WAIT socket as a /proc/net/tcp line. */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2932 
2933 #define TMPSZ 150
2934 
2935 static int tcp4_seq_show(struct seq_file *seq, void *v)
2936 {
2937 	struct tcp_iter_state *st;
2938 	struct sock *sk = v;
2939 
2940 	seq_setwidth(seq, TMPSZ - 1);
2941 	if (v == SEQ_START_TOKEN) {
2942 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2943 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2944 			   "inode");
2945 		goto out;
2946 	}
2947 	st = seq->private;
2948 
2949 	if (sk->sk_state == TCP_TIME_WAIT)
2950 		get_timewait4_sock(v, seq, st->num);
2951 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2952 		get_openreq4(v, seq, st->num);
2953 	else
2954 		get_tcp4_sock(v, seq, st->num);
2955 out:
2956 	seq_pad(seq, '\n');
2957 	return 0;
2958 }
2959 
2960 #ifdef CONFIG_BPF_SYSCALL
/* A batch slot holds either a held socket reference or, once the batch
 * is released (see bpf_iter_tcp_put_batch()), the socket's cookie used
 * to resume iteration.
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};

/* Iterator state: generic tcp_iter_state plus the current batch of
 * sockets ([cur_sk, end_sk) not yet shown, max_sk allocated slots).
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	union bpf_tcp_iter_batch_item *batch;
};

/* Context passed to the BPF program for each iterated socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2979 
/* Invoke the attached BPF program for one socket of the iteration. */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
2991 
/* Release the references of all not-yet-shown sockets in the batch,
 * replacing each pointer with the socket's cookie so the iteration can
 * later be resumed at the same place.
 */
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
	union bpf_tcp_iter_batch_item *item;
	unsigned int cur_sk = iter->cur_sk;
	__u64 cookie;

	/* Remember the cookies of the sockets we haven't seen yet, so we can
	 * pick up where we left off next time around.
	 */
	while (cur_sk < iter->end_sk) {
		item = &iter->batch[cur_sk++];
		cookie = sock_gen_cookie(item->sk);
		sock_gen_put(item->sk);
		item->cookie = cookie;
	}
}
3008 
3009 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3010 				      unsigned int new_batch_sz, gfp_t flags)
3011 {
3012 	union bpf_tcp_iter_batch_item *new_batch;
3013 
3014 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3015 			     flags | __GFP_NOWARN);
3016 	if (!new_batch)
3017 		return -ENOMEM;
3018 
3019 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3020 	kvfree(iter->batch);
3021 	iter->batch = new_batch;
3022 	iter->max_sk = new_batch_sz;
3023 
3024 	return 0;
3025 }
3026 
/* Starting from @first_sk, scan the bucket chain for the first socket
 * whose cookie matches one of the remembered @cookies (tried in order).
 * Returns NULL when none of the remembered sockets is still present.
 */
static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
					       union bpf_tcp_iter_batch_item *cookies,
					       int n_cookies)
{
	struct hlist_nulls_node *node;
	struct sock *sk;
	int i;

	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		sk_nulls_for_each_from(sk, node)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				return sk;
	}

	return NULL;
}
3044 
/* Re-find our position in the listening hash after the previous batch was
 * released.  On entry, iter->batch[cur_sk..end_sk) holds socket cookies
 * (not pointers) saved by bpf_iter_tcp_put_batch().  Returns the socket
 * to start batching from with its bucket lock held, or NULL when the
 * listening hash is exhausted.
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* Every socket of the previous batch was shown: move on to the
	 * next bucket.
	 */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	/* Returns with the bucket's lock held when a socket is found. */
	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		/* Same bucket as last time with unseen cookies left: skip
		 * past the sockets we already visited.
		 */
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* All remembered sockets are gone; release this
			 * bucket and continue with the next one.
			 */
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}
3074 
/* Established-hash counterpart of bpf_iter_tcp_resume_listening(): re-find
 * our position after the previous batch was released.  iter->batch holds
 * cookies on entry.  Returns the socket to start batching from with the
 * ehash bucket lock held (BH disabled), or NULL when exhausted.
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* Every socket of the previous batch was shown: next bucket. */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	/* Returns with the ehash bucket lock held when a socket is found. */
	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		/* Same bucket as last time: skip the sockets already seen. */
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* Nothing remembered remains; move to the next
			 * bucket after dropping this one's lock.
			 */
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}
3104 
/* Resume iteration from the saved state: listening hash first, then the
 * established hash.  Returns the socket to start the next batch from
 * (with the relevant bucket lock held) or NULL when both are exhausted.
 */
static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		sk = bpf_iter_tcp_resume_listening(seq);
		if (sk)
			break;
		/* Listening hash done; switch to the established hash. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		sk = bpf_iter_tcp_resume_established(seq);
		break;
	}

	return sk;
}
3126 
/* Fill iter->batch with held references to the matching sockets of the
 * current listening bucket, starting at *start_sk.  Called with the
 * bucket lock held.
 *
 * Returns the total number of matching sockets in the bucket.  If that
 * exceeds iter->max_sk, the overflowing sockets are not held and
 * *start_sk is set to the first one that did not fit (NULL when
 * everything fit), so the caller can detect and handle the truncation.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
						 struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	/* *start_sk already matched; batch it unconditionally. */
	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}
3155 
/* Established-hash counterpart of bpf_iter_tcp_listening_batch(): fill
 * iter->batch with held references to the matching sockets of the current
 * ehash bucket, starting at *start_sk.  Called with the bucket lock held.
 *
 * Returns the total number of matching sockets in the bucket; on
 * truncation *start_sk is set to the first socket that did not fit
 * (NULL when everything fit).
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
						   struct sock **start_sk)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct hlist_nulls_node *node;
	unsigned int expected = 1;
	struct sock *sk;

	/* *start_sk already matched; batch it unconditionally. */
	sock_hold(*start_sk);
	iter->batch[iter->end_sk++].sk = *start_sk;

	sk = sk_nulls_next(*start_sk);
	*start_sk = NULL;
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk)) {
			if (iter->end_sk < iter->max_sk) {
				sock_hold(sk);
				iter->batch[iter->end_sk++].sk = sk;
			} else if (!*start_sk) {
				/* Remember where we left off. */
				*start_sk = sk;
			}
			expected++;
		}
	}

	return expected;
}
3184 
3185 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3186 					struct sock **start_sk)
3187 {
3188 	struct bpf_tcp_iter_state *iter = seq->private;
3189 	struct tcp_iter_state *st = &iter->state;
3190 
3191 	if (st->state == TCP_SEQ_STATE_LISTENING)
3192 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3193 	else
3194 		return bpf_iter_tcp_established_batch(seq, start_sk);
3195 }
3196 
3197 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3198 {
3199 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3200 	struct bpf_tcp_iter_state *iter = seq->private;
3201 	struct tcp_iter_state *st = &iter->state;
3202 
3203 	if (st->state == TCP_SEQ_STATE_LISTENING)
3204 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3205 	else
3206 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3207 }
3208 
/* Produce the next batch of sockets to show.
 *
 * Resumes at the saved bucket, then copies the whole bucket into
 * iter->batch.  If the batch array turns out too small, the bucket is
 * unlocked, the array grown (1.5x the needed size) and the bucket
 * re-read.  If it is still too small (the bucket grew meanwhile), the
 * final realloc is done with GFP_NOWAIT while keeping the bucket lock
 * held so the bucket cannot change again - hence the WARN if the last
 * fill still does not match.
 *
 * Returns the first socket of the new batch, NULL when iteration is
 * done, or an ERR_PTR on allocation failure.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3256 
3257 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3258 {
3259 	/* bpf iter does not support lseek, so it always
3260 	 * continue from where it was stop()-ped.
3261 	 */
3262 	if (*pos)
3263 		return bpf_iter_tcp_batch(seq);
3264 
3265 	return SEQ_START_TOKEN;
3266 }
3267 
/* seq_file next() callback: release the socket just shown and return the
 * next one, refilling the batch from the next bucket when the current
 * batch is exhausted.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
3299 
/* seq_file show() callback: run the attached BPF prog on one socket.
 *
 * Full sockets are shown under lock_sock() so the prog can safely use
 * the sk setsockopt/getsockopt helpers (see
 * bpf_iter_tcp_get_func_proto()); request and timewait sockets are
 * passed without that lock.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	/* The socket may have left the hash since it was batched; skip it. */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request socks are reported with their listener's uid. */
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}
3340 
/* seq_file stop() callback.  When iteration ran to completion (v == NULL),
 * give the prog one final invocation with a NULL socket; in either case
 * release the references on any sockets still sitting in the batch.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}
3357 
/* seq_file operations backing the BPF "tcp" iterator. */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
3364 #endif
/* Return the address family this seq_file iterates over: AF_UNSPEC for
 * the bpf iterator (it matches all families and filters in the prog),
 * otherwise the family recorded in the procfs afinfo.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
3379 
/* seq_file operations for /proc/net/tcp (see tcp4_proc_init_net()). */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
3386 
/* Restricts the /proc/net/tcp walk to AF_INET sockets
 * (consumed by seq_file_family()).
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3390 
3391 static int __net_init tcp4_proc_init_net(struct net *net)
3392 {
3393 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3394 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3395 		return -ENOMEM;
3396 	return 0;
3397 }
3398 
/* Remove /proc/net/tcp for this network namespace. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3403 
/* Per-netns hooks creating and removing /proc/net/tcp. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3408 
/* Register the /proc/net/tcp pernet operations at boot. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3413 
/* Unregister the /proc/net/tcp pernet operations. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3418 #endif /* CONFIG_PROC_FS */
3419 
/* The protocol descriptor for IPv4 TCP sockets: wires the socket-layer
 * operations to their TCP implementations.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory-pressure and accounting state shared by all TCP sockets. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,

	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.freeptr_offset		= offsetof(struct tcp_sock,
					   inet_conn.icsk_inet.sk.sk_freeptr),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3471 EXPORT_SYMBOL(tcp_prot);
3472 
/* Per-netns exit: drop the reference taken on the namespace's congestion
 * control module in tcp_sk_init().
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
3479 
/* Pick the established-hash table for a new netns.  A child netns gets a
 * private table sized by the creating netns' sysctl_tcp_child_ehash_entries;
 * init_net, a zero sysctl, or an allocation failure all fall back to the
 * global tcp_hashinfo.  Also derives the max_tw_buckets and syn backlog
 * defaults from the chosen ehash size.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* old_net is the netns of the process creating this one. */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3509 
/* Per-netns init: set every TCP sysctl to its default, pick the hash
 * tables, and inherit the congestion control (and rmem/wmem limits) from
 * init_net where applicable.  Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 4 MB */
	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit the current rmem/wmem limits of init_net. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;

	return 0;
}
3617 
/* Batched per-netns teardown: purge TIME_WAIT sockets of the dying
 * namespaces and free any per-netns ehash tables, serialized against
 * concurrent teardown paths by tcp_exit_batch_mutex.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and failed setup_net error unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, the thread that dismantles a particular twsk must
	 * do so without other thread progressing to refcount_dec_and_test() of
	 * tcp_death_row.tw_refcount.
	 */
	mutex_lock(&tcp_exit_batch_mutex);

	tcp_twsk_purge(net_exit_list);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		/* The initial ref from tcp_sk_init() must be the last one. */
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}

	mutex_unlock(&tcp_exit_batch_mutex);
}
3642 
/* TCP per-netns setup/teardown operations. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
3648 
3649 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter__tcp "show" entry point for the "tcp" target. */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial number of sockets a batch can hold; grown on demand. */
#define INIT_BATCH_SZ 16
3654 
3655 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3656 {
3657 	struct bpf_tcp_iter_state *iter = priv_data;
3658 	int err;
3659 
3660 	err = bpf_iter_init_seq_net(priv_data, aux);
3661 	if (err)
3662 		return err;
3663 
3664 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3665 	if (err) {
3666 		bpf_iter_fini_seq_net(priv_data);
3667 		return err;
3668 	}
3669 
3670 	return 0;
3671 }
3672 
/* Tear down what bpf_iter_init_tcp() set up. */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3680 
/* Tells the bpf_iter core how to drive the tcp seq_file and how much
 * private state each iterator instance needs.
 */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3687 
/* Extra helpers available to tcp iterator programs: sk-level
 * setsockopt/getsockopt (safe because full sockets are shown under
 * lock_sock()).  NULL defers to the default helper set.
 */
static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}
3701 
/* Registration record for the "tcp" bpf_iter target.  The sk_common
 * context argument's BTF id is filled in by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3712 
/* Register the "tcp" bpf_iter target at boot.  The sock_common BTF id is
 * only known at runtime, so it is patched into tcp_reg_info here.
 */
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
3719 
3720 #endif
3721 
/* Boot-time IPv4 TCP setup: create one kernel control socket per possible
 * CPU (used to send the RSTs/ACKs mentioned below), register the
 * per-netns operations, and register the bpf iterator target.  Panics on
 * failure since TCP cannot work without these.
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		sk->sk_clockid = CLOCK_MONOTONIC;

		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
3751