xref: /linux/net/ipv4/tcp_ipv4.c (revision 2ed4b46b4fc77749cb0f8dd31a01441b82c8dbaa)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 #include <crypto/utils.h>
92 
93 #include <trace/events/tcp.h>
94 
95 #ifdef CONFIG_TCP_MD5SIG
96 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
97 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
98 #endif
99 
100 struct inet_hashinfo tcp_hashinfo;
101 
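/* Per-CPU kernel control sockets used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() below to transmit replies outside of full socket
 * context; bh_lock serializes their use from BH context.
 */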
102 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
103 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
104 };
105 
106 static DEFINE_MUTEX(tcp_exit_batch_mutex);
107 
108 static union tcp_seq_and_ts_off
109 tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
110 {
111 	return secure_tcp_seq_and_ts_off(net,
112 					 ip_hdr(skb)->daddr,
113 					 ip_hdr(skb)->saddr,
114 					 tcp_hdr(skb)->dest,
115 					 tcp_hdr(skb)->source);
116 }
117 
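/* Decide whether an existing TIME-WAIT socket occupying the desired 4-tuple
 * may be reused for a new outgoing connection (net.ipv4.tcp_tw_reuse).
 * With tcp_tw_reuse == 2, reuse is restricted to loopback traffic, as
 * checked below.  Returns 1, with a reference taken on sktw, when reuse is
 * allowed; 0 otherwise.
 */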
118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 	struct tcp_sock *tp = tcp_sk(sk);
124 	int ts_recent_stamp;
125 	u32 reuse_thresh;
126 
127 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 		reuse = 0;
129 
130 	if (reuse == 2) {
131 		/* Still does not detect *everything* that goes through
132 		 * lo, since we require a loopback src or dst address
133 		 * or direct binding to 'lo' interface.
134 		 */
135 		bool loopback = false;
136 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 			loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 		if (tw->tw_family == AF_INET6) {
140 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 				loopback = true;
145 		} else
146 #endif
147 		{
148 			if (ipv4_is_loopback(tw->tw_daddr) ||
149 			    ipv4_is_loopback(tw->tw_rcv_saddr))
150 				loopback = true;
151 		}
152 		if (!loopback)
153 			reuse = 0;
154 	}
155 
156 	/* With PAWS, it is safe from the viewpoint
157 	   of data integrity. Even without PAWS it is safe provided sequence
158 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159 
160 	   Actually, the idea is close to VJ's, only the timestamp cache is
161 	   held not per host but per port pair, and the TW bucket is used as
162 	   the state holder.
163 
164 	   If the TW bucket has already been destroyed we fall back to VJ's
165 	   scheme and use the initial timestamp retrieved from the peer table.
166 	 */
167 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 	if (ts_recent_stamp &&
171 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 		 * and releasing the bucket lock.
174 		 */
175 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 			return 0;
177 
178 		/* In case of repair and re-using TIME-WAIT sockets we still
179 		 * want to be sure that it is safe as above but honor the
180 		 * sequence numbers and time stamps set as part of the repair
181 		 * process.
182 		 *
183 		 * Without this check re-using a TIME-WAIT socket with TCP
184 		 * repair would accumulate a -1 on the repair assigned
185 		 * sequence number. The first time it is reused the sequence
186 		 * is -1, the second time -2, etc. This fixes that issue
187 		 * without appearing to create any others.
188 		 */
189 		if (likely(!tp->repair)) {
190 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191 
192 			if (!seq)
193 				seq = 1;
194 			WRITE_ONCE(tp->write_seq, seq);
195 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
196 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 		}
198 
199 		return 1;
200 	}
201 
202 	return 0;
203 }
204 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
205 
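/* ->pre_connect() hook, run from __inet_stream_connect() before
 * tcp_v4_connect(), so that a BPF_CGROUP_INET4_CONNECT program can inspect
 * or rewrite the destination address under the socket lock.
 */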
206 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
207 			      int addr_len)
208 {
209 	/* This check is replicated from tcp_v4_connect() and intended to
210 	 * prevent BPF program called below from accessing bytes that are out
211 	 * of the bound specified by user in addr_len.
212 	 */
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	sock_owned_by_me(sk);
217 
218 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
219 }
220 
221 /* This will initiate an outgoing connection. */
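/* Illustrative userspace trigger (not part of this file): a connect(2) on an
 * AF_INET stream socket reaches this function via inet_stream_connect():
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port = htons(80) };
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */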
222 int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
223 {
224 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
225 	struct inet_timewait_death_row *tcp_death_row;
226 	struct inet_sock *inet = inet_sk(sk);
227 	struct tcp_sock *tp = tcp_sk(sk);
228 	struct ip_options_rcu *inet_opt;
229 	struct net *net = sock_net(sk);
230 	__be16 orig_sport, orig_dport;
231 	__be32 daddr, nexthop;
232 	struct flowi4 *fl4;
233 	struct rtable *rt;
234 	int err;
235 
236 	if (addr_len < sizeof(struct sockaddr_in))
237 		return -EINVAL;
238 
239 	if (usin->sin_family != AF_INET)
240 		return -EAFNOSUPPORT;
241 
242 	nexthop = daddr = usin->sin_addr.s_addr;
243 	inet_opt = rcu_dereference_protected(inet->inet_opt,
244 					     lockdep_sock_is_held(sk));
245 	if (inet_opt && inet_opt->opt.srr) {
246 		if (!daddr)
247 			return -EINVAL;
248 		nexthop = inet_opt->opt.faddr;
249 	}
250 
251 	orig_sport = inet->inet_sport;
252 	orig_dport = usin->sin_port;
253 	fl4 = &inet->cork.fl.u.ip4;
254 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
255 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
256 			      orig_dport, sk);
257 	if (IS_ERR(rt)) {
258 		err = PTR_ERR(rt);
259 		if (err == -ENETUNREACH)
260 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
261 		return err;
262 	}
263 
264 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
265 		ip_rt_put(rt);
266 		return -ENETUNREACH;
267 	}
268 
269 	if (!inet_opt || !inet_opt->opt.srr)
270 		daddr = fl4->daddr;
271 
272 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
273 
274 	if (!inet->inet_saddr) {
275 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
276 		if (err) {
277 			ip_rt_put(rt);
278 			return err;
279 		}
280 	} else {
281 		sk_rcv_saddr_set(sk, inet->inet_saddr);
282 	}
283 
284 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
285 		/* Reset inherited state */
286 		tp->rx_opt.ts_recent	   = 0;
287 		tp->rx_opt.ts_recent_stamp = 0;
288 		if (likely(!tp->repair))
289 			WRITE_ONCE(tp->write_seq, 0);
290 	}
291 
292 	inet->inet_dport = usin->sin_port;
293 	sk_daddr_set(sk, daddr);
294 
295 	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
296 	if (inet_opt)
297 		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
298 
299 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
300 
301 	/* Socket identity is still unknown (sport may be zero).
302 	 * However we set state to SYN-SENT and, without releasing the socket
303 	 * lock, select a source port, enter ourselves into the hash tables and
304 	 * complete initialization after this.
305 	 */
306 	tcp_set_state(sk, TCP_SYN_SENT);
307 	err = inet_hash_connect(tcp_death_row, sk);
308 	if (err)
309 		goto failure;
310 
311 	sk_set_txhash(sk);
312 
313 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
314 			       inet->inet_sport, inet->inet_dport, sk);
315 	if (IS_ERR(rt)) {
316 		err = PTR_ERR(rt);
317 		rt = NULL;
318 		goto failure;
319 	}
320 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
321 	/* OK, now commit destination to socket.  */
322 	sk->sk_gso_type = SKB_GSO_TCPV4;
323 	sk_setup_caps(sk, &rt->dst);
324 	rt = NULL;
325 
326 	if (likely(!tp->repair)) {
327 		union tcp_seq_and_ts_off st;
328 
329 		st = secure_tcp_seq_and_ts_off(net,
330 					       inet->inet_saddr,
331 					       inet->inet_daddr,
332 					       inet->inet_sport,
333 					       usin->sin_port);
334 		if (!tp->write_seq)
335 			WRITE_ONCE(tp->write_seq, st.seq);
336 		WRITE_ONCE(tp->tsoffset, st.ts_off);
337 	}
338 
339 	atomic_set(&inet->inet_id, get_random_u16());
340 
341 	if (tcp_fastopen_defer_connect(sk, &err))
342 		return err;
343 	if (err)
344 		goto failure;
345 
346 	err = tcp_connect(sk);
347 
348 	if (err)
349 		goto failure;
350 
351 	return 0;
352 
353 failure:
354 	/*
355 	 * This unhashes the socket and releases the local port,
356 	 * if necessary.
357 	 */
358 	tcp_set_state(sk, TCP_CLOSE);
359 	inet_bhash2_reset_saddr(sk);
360 	ip_rt_put(rt);
361 	sk->sk_route_caps = 0;
362 	inet->inet_dport = 0;
363 	return err;
364 }
365 EXPORT_IPV6_MOD(tcp_v4_connect);
366 
367 /*
368  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
369  * It can be called through tcp_release_cb() if the socket was owned by the
370  * user at the time tcp_v4_err() was called to handle the ICMP message.
371  */
372 void tcp_v4_mtu_reduced(struct sock *sk)
373 {
374 	struct inet_sock *inet = inet_sk(sk);
375 	struct dst_entry *dst;
376 	u32 mtu, dmtu;
377 
378 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
379 		return;
380 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
381 	dst = inet_csk_update_pmtu(sk, mtu);
382 	if (!dst)
383 		return;
384 
385 	/* Something is about to go wrong... Remember the soft error
386 	 * in case this connection will not be able to recover.
387 	 */
388 	dmtu = dst4_mtu(dst);
389 	if (mtu < dmtu && ip_dont_fragment(sk, dst))
390 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
391 
392 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
393 	    ip_sk_accept_pmtu(sk) &&
394 	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
395 		tcp_sync_mss(sk, dmtu);
396 
397 		/* Resend the TCP packet because it's
398 		 * clear that the old packet has been
399 		 * dropped. This is the new "fast" path mtu
400 		 * discovery.
401 		 */
402 		tcp_simple_retransmit(sk);
403 	} /* else let the usual retransmit timer handle it */
404 }
405 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
406 
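/* Handle an ICMP redirect for this socket by letting the cached route's
 * ->redirect() handler update the next hop, provided the dst is still valid.
 */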
407 static void do_redirect(struct sk_buff *skb, struct sock *sk)
408 {
409 	struct dst_entry *dst = __sk_dst_check(sk, 0);
410 
411 	if (dst)
412 		dst->ops->redirect(dst, sk, skb);
413 }
414 
415 
416 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
417 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
418 {
419 	struct request_sock *req = inet_reqsk(sk);
420 	struct net *net = sock_net(sk);
421 
422 	/* ICMPs are not backlogged, hence we cannot get
423 	 * an established socket here.
424 	 */
425 	if (seq != tcp_rsk(req)->snt_isn) {
426 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
427 	} else if (abort) {
428 		/*
429 		 * Still in SYN_RECV, just remove it silently.
430 		 * There is no good way to pass the error to the newly
431 		 * created socket, and POSIX does not want network
432 		 * errors returned from accept().
433 		 */
434 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
435 		tcp_listendrop(req->rsk_listener);
436 	}
437 	reqsk_put(req);
438 }
439 EXPORT_IPV6_MOD(tcp_req_err);
440 
441 /* TCP-LD (RFC 6069) logic */
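/* An ICMP unreachable that covers snd_una while the retransmission timer is
 * backing off indicates the loss is due to a connectivity disruption rather
 * than congestion, so undo one RTO backoff step and re-arm (or immediately
 * run) the retransmission timer to probe the path again sooner.
 */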
442 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
443 {
444 	struct inet_connection_sock *icsk = inet_csk(sk);
445 	struct tcp_sock *tp = tcp_sk(sk);
446 	struct sk_buff *skb;
447 	s32 remaining;
448 	u32 delta_us;
449 
450 	if (sock_owned_by_user(sk))
451 		return;
452 
453 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
454 	    !icsk->icsk_backoff)
455 		return;
456 
457 	skb = tcp_rtx_queue_head(sk);
458 	if (WARN_ON_ONCE(!skb))
459 		return;
460 
461 	icsk->icsk_backoff--;
462 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
463 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
464 
465 	tcp_mstamp_refresh(tp);
466 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
467 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
468 
469 	if (remaining > 0) {
470 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
471 	} else {
472 		/* RTO revert clocked out retransmission.
473 		 * Will retransmit now.
474 		 */
475 		tcp_retransmit_timer(sk);
476 	}
477 }
478 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
479 
480 /*
481  * This routine is called by the ICMP module when it gets some
482  * sort of error condition.  If err < 0 then the socket should
483  * be closed and the error returned to the user.  If err > 0
484  * it's just the icmp type << 8 | icmp code.  After adjustment
485  * header points to the first 8 bytes of the tcp header.  We need
486  * to find the appropriate port.
487  *
488  * The locking strategy used here is very "optimistic". When
489  * someone else accesses the socket the ICMP is just dropped
490  * and for some paths there is no check at all.
491  * A more general error queue to queue errors for later handling
492  * is probably better.
493  *
494  */
495 
496 int tcp_v4_err(struct sk_buff *skb, u32 info)
497 {
498 	const struct iphdr *iph = (const struct iphdr *)skb->data;
499 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
500 	struct net *net = dev_net_rcu(skb->dev);
501 	const int type = icmp_hdr(skb)->type;
502 	const int code = icmp_hdr(skb)->code;
503 	struct request_sock *fastopen;
504 	struct tcp_sock *tp;
505 	u32 seq, snd_una;
506 	struct sock *sk;
507 	int err;
508 
509 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
510 				       ntohs(th->source), inet_iif(skb), 0);
511 	if (!sk) {
512 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
513 		return -ENOENT;
514 	}
515 	if (sk->sk_state == TCP_TIME_WAIT) {
516 		/* To increase the counter of ignored icmps for TCP-AO */
517 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
518 		inet_twsk_put(inet_twsk(sk));
519 		return 0;
520 	}
521 	seq = ntohl(th->seq);
522 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
523 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
524 				     type == ICMP_TIME_EXCEEDED ||
525 				     (type == ICMP_DEST_UNREACH &&
526 				      (code == ICMP_NET_UNREACH ||
527 				       code == ICMP_HOST_UNREACH)));
528 		return 0;
529 	}
530 
531 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
532 		sock_put(sk);
533 		return 0;
534 	}
535 
536 	bh_lock_sock(sk);
537 	/* If too many ICMPs get dropped on busy
538 	 * servers this needs to be solved differently.
539 	 * We do take care of the PMTU discovery (RFC1191) special case:
540 	 * we can receive locally generated ICMP messages while the socket is held.
541 	 */
542 	if (sock_owned_by_user(sk)) {
543 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
544 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
545 	}
546 	if (sk->sk_state == TCP_CLOSE)
547 		goto out;
548 
549 	if (static_branch_unlikely(&ip4_min_ttl)) {
550 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
551 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
552 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
553 			goto out;
554 		}
555 	}
556 
557 	tp = tcp_sk(sk);
558 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
559 	fastopen = rcu_dereference(tp->fastopen_rsk);
560 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
561 	if (sk->sk_state != TCP_LISTEN &&
562 	    !between(seq, snd_una, tp->snd_nxt)) {
563 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
564 		goto out;
565 	}
566 
567 	switch (type) {
568 	case ICMP_REDIRECT:
569 		if (!sock_owned_by_user(sk))
570 			do_redirect(skb, sk);
571 		goto out;
572 	case ICMP_SOURCE_QUENCH:
573 		/* Just silently ignore these. */
574 		goto out;
575 	case ICMP_PARAMETERPROB:
576 		err = EPROTO;
577 		break;
578 	case ICMP_DEST_UNREACH:
579 		if (code > NR_ICMP_UNREACH)
580 			goto out;
581 
582 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
583 			/* We are not interested in TCP_LISTEN and open_requests
584 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
585 			 * they should go through unfragmented).
586 			 */
587 			if (sk->sk_state == TCP_LISTEN)
588 				goto out;
589 
590 			WRITE_ONCE(tp->mtu_info, info);
591 			if (!sock_owned_by_user(sk)) {
592 				tcp_v4_mtu_reduced(sk);
593 			} else {
594 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
595 					sock_hold(sk);
596 			}
597 			goto out;
598 		}
599 
600 		err = icmp_err_convert[code].errno;
601 		/* check if this ICMP message allows revert of backoff.
602 		 * (see RFC 6069)
603 		 */
604 		if (!fastopen &&
605 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
606 			tcp_ld_RTO_revert(sk, seq);
607 		break;
608 	case ICMP_TIME_EXCEEDED:
609 		err = EHOSTUNREACH;
610 		break;
611 	default:
612 		goto out;
613 	}
614 
615 	switch (sk->sk_state) {
616 	case TCP_SYN_SENT:
617 	case TCP_SYN_RECV:
618 		/* Only in fast or simultaneous open. If a fast open socket is
619 		 * already accepted it is treated as a connected one below.
620 		 */
621 		if (fastopen && !fastopen->sk)
622 			break;
623 
624 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
625 
626 		if (!sock_owned_by_user(sk))
627 			tcp_done_with_error(sk, err);
628 		else
629 			WRITE_ONCE(sk->sk_err_soft, err);
630 		goto out;
631 	}
632 
633 	/* If we've already connected we will keep trying
634 	 * until we time out, or the user gives up.
635 	 *
636 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
637 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
638 	 * but it is obsoleted by pmtu discovery).
639 	 *
640 	 * Note that in the modern internet, where routing is unreliable
641 	 * and broken firewalls sit in every dark corner sending random
642 	 * errors ordered by their masters, even these two messages finally
643 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
644 	 *
645 	 * Now we are in compliance with RFCs.
646 	 *							--ANK (980905)
647 	 */
648 
649 	if (!sock_owned_by_user(sk) &&
650 	    inet_test_bit(RECVERR, sk)) {
651 		WRITE_ONCE(sk->sk_err, err);
652 		sk_error_report(sk);
653 	} else	{ /* Only an error on timeout */
654 		WRITE_ONCE(sk->sk_err_soft, err);
655 	}
656 
657 out:
658 	bh_unlock_sock(sk);
659 	sock_put(sk);
660 	return 0;
661 }
662 
663 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
664 
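/* Append and sign a TCP-AO option for the RST being built by
 * tcp_v4_send_reset().  Returns true if the RST must be dropped (no usable
 * key or hashing failed), false once the option has been written and signed.
 */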
665 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
666 				 const struct tcp_ao_hdr *aoh,
667 				 struct ip_reply_arg *arg, struct tcphdr *reply,
668 				 __be32 reply_options[REPLY_OPTIONS_LEN])
669 {
670 #ifdef CONFIG_TCP_AO
671 	int sdif = tcp_v4_sdif(skb);
672 	int dif = inet_iif(skb);
673 	int l3index = sdif ? dif : 0;
674 	bool allocated_traffic_key;
675 	struct tcp_ao_key *key;
676 	char *traffic_key;
677 	bool drop = true;
678 	u32 ao_sne = 0;
679 	u8 keyid;
680 
681 	rcu_read_lock();
682 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
683 				 &key, &traffic_key, &allocated_traffic_key,
684 				 &keyid, &ao_sne))
685 		goto out;
686 
687 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
688 				 (aoh->rnext_keyid << 8) | keyid);
689 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
690 	reply->doff = arg->iov[0].iov_len / 4;
691 
692 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
693 			    key, traffic_key,
694 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
695 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
696 			    reply, ao_sne))
697 		goto out;
698 	drop = false;
699 out:
700 	rcu_read_unlock();
701 	if (allocated_traffic_key)
702 		kfree(traffic_key);
703 	return drop;
704 #else
705 	return true;
706 #endif
707 }
708 
709 /*
710  *	This routine will send an RST to the other tcp.
711  *	This routine will send an RST to the other TCP.
712  *
713  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
714  *		      for the reset?
715  *	Answer: if a packet caused an RST, it is not for a socket
716  *		existing in our system; if it is matched to a socket,
717  *		it is just a duplicate segment or a bug in the other side's TCP.
718  *		So we build the reply based only on the parameters that
719  *		arrived with the segment.
720  */
721 
722 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
723 			      enum sk_rst_reason reason)
724 {
725 	const struct tcphdr *th = tcp_hdr(skb);
726 	struct {
727 		struct tcphdr th;
728 		__be32 opt[REPLY_OPTIONS_LEN];
729 	} rep;
730 	const __u8 *md5_hash_location = NULL;
731 	const struct tcp_ao_hdr *aoh;
732 	struct ip_reply_arg arg;
733 #ifdef CONFIG_TCP_MD5SIG
734 	struct tcp_md5sig_key *key = NULL;
735 	unsigned char newhash[16];
736 	struct sock *sk1 = NULL;
737 #endif
738 	u64 transmit_time = 0;
739 	struct sock *ctl_sk;
740 	struct net *net;
741 	u32 txhash = 0;
742 
743 	/* Never send a reset in response to a reset. */
744 	if (th->rst)
745 		return;
746 
747 	/* If sk is not NULL, it means we did a successful lookup and the
748 	 * incoming route had to be correct. prequeue might have dropped our dst.
749 	 */
750 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
751 		return;
752 
753 	/* Swap the send and the receive. */
754 	memset(&rep, 0, sizeof(rep));
755 	rep.th.dest   = th->source;
756 	rep.th.source = th->dest;
757 	rep.th.doff   = sizeof(struct tcphdr) / 4;
758 	rep.th.rst    = 1;
759 
760 	if (th->ack) {
761 		rep.th.seq = th->ack_seq;
762 	} else {
763 		rep.th.ack = 1;
764 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
765 				       skb->len - (th->doff << 2));
766 	}
767 
768 	memset(&arg, 0, sizeof(arg));
769 	arg.iov[0].iov_base = (unsigned char *)&rep;
770 	arg.iov[0].iov_len  = sizeof(rep.th);
771 
772 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
773 
774 	/* Invalid TCP option size or twice included auth */
775 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
776 		return;
777 
778 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
779 		return;
780 
781 #ifdef CONFIG_TCP_MD5SIG
782 	rcu_read_lock();
783 	if (sk && sk_fullsock(sk)) {
784 		const union tcp_md5_addr *addr;
785 		int l3index;
786 
787 		/* sdif set, means packet ingressed via a device
788 		 * in an L3 domain and inet_iif is set to it.
789 		 */
790 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
791 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
792 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
793 	} else if (md5_hash_location) {
794 		const union tcp_md5_addr *addr;
795 		int sdif = tcp_v4_sdif(skb);
796 		int dif = inet_iif(skb);
797 		int l3index;
798 
799 		/*
800 		 * The active side is lost. Try to find the listening socket through
801 		 * the source port, and then find the md5 key through that socket.
802 		 * We do not loosen security here:
803 		 * the incoming packet is checked against the md5 hash of the key we
804 		 * found; no RST is generated if the hashes don't match.
805 		 */
806 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
807 					     th->source, ip_hdr(skb)->daddr,
808 					     ntohs(th->source), dif, sdif);
809 		/* don't send rst if it can't find key */
810 		if (!sk1)
811 			goto out;
812 
813 		/* sdif set, means packet ingressed via a device
814 		 * in an L3 domain and dif is set to it.
815 		 */
816 		l3index = sdif ? dif : 0;
817 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
818 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
819 		if (!key)
820 			goto out;
821 
822 		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
823 		if (crypto_memneq(md5_hash_location, newhash, 16))
824 			goto out;
825 	}
826 
827 	if (key) {
828 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
829 				   (TCPOPT_NOP << 16) |
830 				   (TCPOPT_MD5SIG << 8) |
831 				   TCPOLEN_MD5SIG);
832 		/* Update length and the length the header thinks exists */
833 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
834 		rep.th.doff = arg.iov[0].iov_len / 4;
835 
836 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
837 				     key, ip_hdr(skb)->saddr,
838 				     ip_hdr(skb)->daddr, &rep.th);
839 	}
840 #endif
841 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
842 	if (rep.opt[0] == 0) {
843 		__be32 mrst = mptcp_reset_option(skb);
844 
845 		if (mrst) {
846 			rep.opt[0] = mrst;
847 			arg.iov[0].iov_len += sizeof(mrst);
848 			rep.th.doff = arg.iov[0].iov_len / 4;
849 		}
850 	}
851 
852 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
853 				      ip_hdr(skb)->saddr, /* XXX */
854 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
855 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
856 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
857 
858 	/* When the socket is gone, all binding information is lost and routing
859 	 * might fail in this case. No choice here: if we choose to force the
860 	 * input interface, we will misroute in the case of an asymmetric route.
861 	 */
862 	if (sk)
863 		arg.bound_dev_if = sk->sk_bound_dev_if;
864 
865 	trace_tcp_send_reset(sk, skb, reason);
866 
867 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
868 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
869 
870 	/* ECN bits of TW reset are cleared */
871 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
872 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
873 	local_bh_disable();
874 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
875 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
876 
877 	sock_net_set(ctl_sk, net);
878 	if (sk) {
879 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
880 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
881 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
882 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
883 		transmit_time = tcp_transmit_time(sk);
884 		xfrm_sk_clone_policy(ctl_sk, sk);
885 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
886 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
887 	} else {
888 		ctl_sk->sk_mark = 0;
889 		ctl_sk->sk_priority = 0;
890 	}
891 	ip_send_unicast_reply(ctl_sk, sk,
892 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
893 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
894 			      &arg, arg.iov[0].iov_len,
895 			      transmit_time, txhash);
896 
897 	xfrm_sk_free_policy(ctl_sk);
898 	sock_net_set(ctl_sk, &init_net);
899 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
900 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
901 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
902 	local_bh_enable();
903 
904 #ifdef CONFIG_TCP_MD5SIG
905 out:
906 	rcu_read_unlock();
907 #endif
908 }
909 
910 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
911    outside of socket context, is certainly ugly. What can I do?
912  */
913 
914 static void tcp_v4_send_ack(const struct sock *sk,
915 			    struct sk_buff *skb, u32 seq, u32 ack,
916 			    u32 win, u32 tsval, u32 tsecr, int oif,
917 			    struct tcp_key *key,
918 			    int reply_flags, u8 tos, u32 txhash)
919 {
920 	const struct tcphdr *th = tcp_hdr(skb);
921 	struct {
922 		struct tcphdr th;
923 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
924 	} rep;
925 	struct net *net = sock_net(sk);
926 	struct ip_reply_arg arg;
927 	struct sock *ctl_sk;
928 	u64 transmit_time;
929 
930 	memset(&rep.th, 0, sizeof(struct tcphdr));
931 	memset(&arg, 0, sizeof(arg));
932 
933 	arg.iov[0].iov_base = (unsigned char *)&rep;
934 	arg.iov[0].iov_len  = sizeof(rep.th);
935 	if (tsecr) {
936 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
937 				   (TCPOPT_TIMESTAMP << 8) |
938 				   TCPOLEN_TIMESTAMP);
939 		rep.opt[1] = htonl(tsval);
940 		rep.opt[2] = htonl(tsecr);
941 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
942 	}
943 
944 	/* Swap the send and the receive. */
945 	rep.th.dest    = th->source;
946 	rep.th.source  = th->dest;
947 	rep.th.doff    = arg.iov[0].iov_len / 4;
948 	rep.th.seq     = htonl(seq);
949 	rep.th.ack_seq = htonl(ack);
950 	rep.th.ack     = 1;
951 	rep.th.window  = htons(win);
952 
953 #ifdef CONFIG_TCP_MD5SIG
954 	if (tcp_key_is_md5(key)) {
955 		int offset = (tsecr) ? 3 : 0;
956 
957 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
958 					  (TCPOPT_NOP << 16) |
959 					  (TCPOPT_MD5SIG << 8) |
960 					  TCPOLEN_MD5SIG);
961 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
962 		rep.th.doff = arg.iov[0].iov_len/4;
963 
964 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
965 				    key->md5_key, ip_hdr(skb)->saddr,
966 				    ip_hdr(skb)->daddr, &rep.th);
967 	}
968 #endif
969 #ifdef CONFIG_TCP_AO
970 	if (tcp_key_is_ao(key)) {
971 		int offset = (tsecr) ? 3 : 0;
972 
973 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
974 					  (tcp_ao_len(key->ao_key) << 16) |
975 					  (key->ao_key->sndid << 8) |
976 					  key->rcv_next);
977 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
978 		rep.th.doff = arg.iov[0].iov_len / 4;
979 
980 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
981 				key->ao_key, key->traffic_key,
982 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
983 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
984 				&rep.th, key->sne);
985 	}
986 #endif
987 	arg.flags = reply_flags;
988 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
989 				      ip_hdr(skb)->saddr, /* XXX */
990 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
991 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
992 	if (oif)
993 		arg.bound_dev_if = oif;
994 	arg.tos = tos;
995 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
996 	local_bh_disable();
997 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
998 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
999 	sock_net_set(ctl_sk, net);
1000 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1001 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1002 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1003 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1004 	transmit_time = tcp_transmit_time(sk);
1005 	ip_send_unicast_reply(ctl_sk, sk,
1006 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1007 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1008 			      &arg, arg.iov[0].iov_len,
1009 			      transmit_time, txhash);
1010 
1011 	sock_net_set(ctl_sk, &init_net);
1012 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1013 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1014 	local_bh_enable();
1015 }
1016 
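/* Send an ACK on behalf of a TIME-WAIT socket, e.g. in reply to a
 * retransmitted FIN or to out-of-window data, using the timestamp and MD5/AO
 * state preserved in the timewait sock.  Drops the lookup's reference on tw.
 */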
1017 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1018 				enum tcp_tw_status tw_status)
1019 {
1020 	struct inet_timewait_sock *tw = inet_twsk(sk);
1021 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1022 	struct tcp_key key = {};
1023 	u8 tos = tw->tw_tos;
1024 
1025 	/* Clean only the ECN bits of TW ACKs for oow data or paws_reject,
1026 	 * while not cleaning the ECN bits of other TW ACKs, to avoid these ACKs
1027 	 * being placed in a different service queue (Classic rather than L4S).
1028 	 */
1029 	if (tw_status == TCP_TW_ACK_OOW)
1030 		tos &= ~INET_ECN_MASK;
1031 
1032 #ifdef CONFIG_TCP_AO
1033 	struct tcp_ao_info *ao_info;
1034 
1035 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1036 		/* FIXME: the segment to-be-acked is not verified yet */
1037 		ao_info = rcu_dereference(tcptw->ao_info);
1038 		if (ao_info) {
1039 			const struct tcp_ao_hdr *aoh;
1040 
1041 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1042 				inet_twsk_put(tw);
1043 				return;
1044 			}
1045 
1046 			if (aoh)
1047 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1048 								    aoh->rnext_keyid, -1);
1049 		}
1050 	}
1051 	if (key.ao_key) {
1052 		struct tcp_ao_key *rnext_key;
1053 
1054 		key.traffic_key = snd_other_key(key.ao_key);
1055 		key.sne = READ_ONCE(ao_info->snd_sne);
1056 		rnext_key = READ_ONCE(ao_info->rnext_key);
1057 		key.rcv_next = rnext_key->rcvid;
1058 		key.type = TCP_KEY_AO;
1059 #else
1060 	if (0) {
1061 #endif
1062 	} else if (static_branch_tcp_md5()) {
1063 		key.md5_key = tcp_twsk_md5_key(tcptw);
1064 		if (key.md5_key)
1065 			key.type = TCP_KEY_MD5;
1066 	}
1067 
1068 	tcp_v4_send_ack(sk, skb,
1069 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1070 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1071 			tcp_tw_tsval(tcptw),
1072 			READ_ONCE(tcptw->tw_ts_recent),
1073 			tw->tw_bound_dev_if, &key,
1074 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1075 			tos,
1076 			tw->tw_txhash);
1077 
1078 	inet_twsk_put(tw);
1079 }
1080 
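/* Send an ACK for a connection still represented by a request_sock
 * (SYN_RECV), e.g. in reply to a retransmitted SYN or an out-of-window
 * segment, signing it with any MD5/AO key configured for the peer.
 */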
1081 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1082 				  struct request_sock *req)
1083 {
1084 	struct tcp_key key = {};
1085 
1086 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1087 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1088 	 */
1089 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1090 					     tcp_sk(sk)->snd_nxt;
1091 
1092 #ifdef CONFIG_TCP_AO
1093 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1094 	    tcp_rsk_used_ao(req)) {
1095 		const union tcp_md5_addr *addr;
1096 		const struct tcp_ao_hdr *aoh;
1097 		int l3index;
1098 
1099 		/* Invalid TCP option size or twice included auth */
1100 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1101 			return;
1102 		if (!aoh)
1103 			return;
1104 
1105 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1106 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1107 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1108 					      aoh->rnext_keyid, -1);
1109 		if (unlikely(!key.ao_key)) {
1110 			/* Send ACK with any matching MKT for the peer */
1111 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1112 			/* Matching key disappeared (user removed the key?)
1113 			 * let the handshake timeout.
1114 			 */
1115 			if (!key.ao_key) {
1116 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1117 						     addr,
1118 						     ntohs(tcp_hdr(skb)->source),
1119 						     &ip_hdr(skb)->daddr,
1120 						     ntohs(tcp_hdr(skb)->dest));
1121 				return;
1122 			}
1123 		}
1124 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1125 		if (!key.traffic_key)
1126 			return;
1127 
1128 		key.type = TCP_KEY_AO;
1129 		key.rcv_next = aoh->keyid;
1130 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1131 #else
1132 	if (0) {
1133 #endif
1134 	} else if (static_branch_tcp_md5()) {
1135 		const union tcp_md5_addr *addr;
1136 		int l3index;
1137 
1138 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1139 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1140 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1141 		if (key.md5_key)
1142 			key.type = TCP_KEY_MD5;
1143 	}
1144 
1145 	/* Clean the ECN bits of ACKs sent for oow data or paws_reject segments */
1146 	tcp_v4_send_ack(sk, skb, seq,
1147 			tcp_rsk(req)->rcv_nxt,
1148 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1149 			tcp_rsk_tsval(tcp_rsk(req)),
1150 			req->ts_recent,
1151 			0, &key,
1152 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1153 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1154 			READ_ONCE(tcp_rsk(req)->txhash));
1155 	if (tcp_key_is_ao(&key))
1156 		kfree(key.traffic_key);
1157 }
1158 
1159 /*
1160  *	Send a SYN-ACK after having received a SYN.
1161  *	This still operates on a request_sock only, not on a big
1162  *	socket.
1163  */
1164 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1165 			      struct flowi *fl,
1166 			      struct request_sock *req,
1167 			      struct tcp_fastopen_cookie *foc,
1168 			      enum tcp_synack_type synack_type,
1169 			      struct sk_buff *syn_skb)
1170 {
1171 	struct inet_request_sock *ireq = inet_rsk(req);
1172 	struct flowi4 fl4;
1173 	int err = -1;
1174 	struct sk_buff *skb;
1175 	u8 tos;
1176 
1177 	/* First, grab a route. */
1178 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1179 		return -1;
1180 
1181 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1182 
1183 	if (skb) {
1184 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1185 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1186 
1187 		tos = READ_ONCE(inet_sk(sk)->tos);
1188 
1189 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1190 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1191 			      (tos & INET_ECN_MASK);
1192 
1193 		if (!INET_ECN_is_capable(tos) &&
1194 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1195 			tos |= INET_ECN_ECT_0;
1196 
1197 		rcu_read_lock();
1198 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1199 					    ireq->ir_rmt_addr,
1200 					    rcu_dereference(ireq->ireq_opt),
1201 					    tos);
1202 		rcu_read_unlock();
1203 		err = net_xmit_eval(err);
1204 	}
1205 
1206 	return err;
1207 }
1208 
1209 /*
1210  *	IPv4 request_sock destructor.
1211  */
1212 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1213 {
1214 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1215 }
1216 
1217 #ifdef CONFIG_TCP_MD5SIG
1218 /*
1219  * RFC2385 MD5 checksumming requires a mapping of
1220  * IP address->MD5 Key.
1221  * We need to maintain these in the sk structure.
1222  */
1223 
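/* Deferred static key: TCP-MD5 processing stays patched out of the fast path
 * until the first key is installed; when the last key is removed the branch
 * is disabled again after a grace period of HZ jiffies.
 */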
1224 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1225 EXPORT_IPV6_MOD(tcp_md5_needed);
1226 
1227 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1228 {
1229 	if (!old)
1230 		return true;
1231 
1232 	/* l3index always overrides non-l3index */
1233 	if (old->l3index && new->l3index == 0)
1234 		return false;
1235 	if (old->l3index == 0 && new->l3index)
1236 		return true;
1237 
1238 	return old->prefixlen < new->prefixlen;
1239 }
1240 
1241 /* Find the Key structure for an address.  */
1242 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1243 					   const union tcp_md5_addr *addr,
1244 					   int family, bool any_l3index)
1245 {
1246 	const struct tcp_sock *tp = tcp_sk(sk);
1247 	struct tcp_md5sig_key *key;
1248 	const struct tcp_md5sig_info *md5sig;
1249 	__be32 mask;
1250 	struct tcp_md5sig_key *best_match = NULL;
1251 	bool match;
1252 
1253 	/* caller either holds rcu_read_lock() or socket lock */
1254 	md5sig = rcu_dereference_check(tp->md5sig_info,
1255 				       lockdep_sock_is_held(sk));
1256 	if (!md5sig)
1257 		return NULL;
1258 
1259 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1260 				 lockdep_sock_is_held(sk)) {
1261 		if (key->family != family)
1262 			continue;
1263 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1264 		    key->l3index != l3index)
1265 			continue;
1266 		if (family == AF_INET) {
1267 			mask = inet_make_mask(key->prefixlen);
1268 			match = (key->addr.a4.s_addr & mask) ==
1269 				(addr->a4.s_addr & mask);
1270 #if IS_ENABLED(CONFIG_IPV6)
1271 		} else if (family == AF_INET6) {
1272 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1273 						  key->prefixlen);
1274 #endif
1275 		} else {
1276 			match = false;
1277 		}
1278 
1279 		if (match && better_md5_match(best_match, key))
1280 			best_match = key;
1281 	}
1282 	return best_match;
1283 }
1284 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1285 
1286 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1287 						      const union tcp_md5_addr *addr,
1288 						      int family, u8 prefixlen,
1289 						      int l3index, u8 flags)
1290 {
1291 	const struct tcp_sock *tp = tcp_sk(sk);
1292 	struct tcp_md5sig_key *key;
1293 	unsigned int size = sizeof(struct in_addr);
1294 	const struct tcp_md5sig_info *md5sig;
1295 
1296 	/* caller either holds rcu_read_lock() or socket lock */
1297 	md5sig = rcu_dereference_check(tp->md5sig_info,
1298 				       lockdep_sock_is_held(sk));
1299 	if (!md5sig)
1300 		return NULL;
1301 #if IS_ENABLED(CONFIG_IPV6)
1302 	if (family == AF_INET6)
1303 		size = sizeof(struct in6_addr);
1304 #endif
1305 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1306 				 lockdep_sock_is_held(sk)) {
1307 		if (key->family != family)
1308 			continue;
1309 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1310 			continue;
1311 		if (key->l3index != l3index)
1312 			continue;
1313 		if (!memcmp(&key->addr, addr, size) &&
1314 		    key->prefixlen == prefixlen)
1315 			return key;
1316 	}
1317 	return NULL;
1318 }
1319 
1320 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1321 					 const struct sock *addr_sk)
1322 {
1323 	const union tcp_md5_addr *addr;
1324 	int l3index;
1325 
1326 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1327 						 addr_sk->sk_bound_dev_if);
1328 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1329 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1330 }
1331 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1332 
1333 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1334 {
1335 	struct tcp_sock *tp = tcp_sk(sk);
1336 	struct tcp_md5sig_info *md5sig;
1337 
1338 	md5sig = kmalloc_obj(*md5sig, gfp);
1339 	if (!md5sig)
1340 		return -ENOMEM;
1341 
1342 	sk_gso_disable(sk);
1343 	INIT_HLIST_HEAD(&md5sig->head);
1344 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1345 	return 0;
1346 }
1347 
1348 /* This can be called on a newly created socket, from other files */
1349 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1350 			    int family, u8 prefixlen, int l3index, u8 flags,
1351 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1352 {
1353 	/* Add Key to the list */
1354 	struct tcp_md5sig_key *key;
1355 	struct tcp_sock *tp = tcp_sk(sk);
1356 	struct tcp_md5sig_info *md5sig;
1357 
1358 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1359 	if (key) {
1360 		/* Pre-existing entry - just update that one.
1361 		 * Note that the key might be used concurrently.
1362 		 * data_race() is telling kcsan that we do not care of
1363 		 * data_race() is telling kcsan that we do not care about
1364 		 * can lead to packet drops.
1365 		 */
1366 		data_race(memcpy(key->key, newkey, newkeylen));
1367 
1368 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1369 		 * Also note that a reader could catch new key->keylen value
1370 		 * but old key->key[], this is the reason we use __GFP_ZERO
1371 		 * at sock_kmalloc() time below these lines.
1372 		 */
1373 		WRITE_ONCE(key->keylen, newkeylen);
1374 
1375 		return 0;
1376 	}
1377 
1378 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1379 					   lockdep_sock_is_held(sk));
1380 
1381 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1382 	if (!key)
1383 		return -ENOMEM;
1384 
1385 	memcpy(key->key, newkey, newkeylen);
1386 	key->keylen = newkeylen;
1387 	key->family = family;
1388 	key->prefixlen = prefixlen;
1389 	key->l3index = l3index;
1390 	key->flags = flags;
1391 	memcpy(&key->addr, addr,
1392 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1393 								 sizeof(struct in_addr));
1394 	hlist_add_head_rcu(&key->node, &md5sig->head);
1395 	return 0;
1396 }
1397 
1398 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1399 		   int family, u8 prefixlen, int l3index, u8 flags,
1400 		   const u8 *newkey, u8 newkeylen)
1401 {
1402 	struct tcp_sock *tp = tcp_sk(sk);
1403 
1404 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1405 		if (fips_enabled) {
1406 			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
1407 			return -EOPNOTSUPP;
1408 		}
1409 
1410 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1411 			return -ENOMEM;
1412 
1413 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1414 			struct tcp_md5sig_info *md5sig;
1415 
1416 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1417 			rcu_assign_pointer(tp->md5sig_info, NULL);
1418 			kfree_rcu(md5sig, rcu);
1419 			return -EUSERS;
1420 		}
1421 	}
1422 
1423 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1424 				newkey, newkeylen, GFP_KERNEL);
1425 }
1426 EXPORT_IPV6_MOD(tcp_md5_do_add);
1427 
1428 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1429 		     int family, u8 prefixlen, int l3index,
1430 		     struct tcp_md5sig_key *key)
1431 {
1432 	struct tcp_sock *tp = tcp_sk(sk);
1433 
1434 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1435 
1436 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1437 			return -ENOMEM;
1438 
1439 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1440 			struct tcp_md5sig_info *md5sig;
1441 
1442 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1443 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1444 			rcu_assign_pointer(tp->md5sig_info, NULL);
1445 			kfree_rcu(md5sig, rcu);
1446 			return -EUSERS;
1447 		}
1448 	}
1449 
1450 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1451 				key->flags, key->key, key->keylen,
1452 				sk_gfp_mask(sk, GFP_ATOMIC));
1453 }
1454 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1455 
1456 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1457 		   u8 prefixlen, int l3index, u8 flags)
1458 {
1459 	struct tcp_md5sig_key *key;
1460 
1461 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1462 	if (!key)
1463 		return -ENOENT;
1464 	hlist_del_rcu(&key->node);
1465 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1466 	kfree_rcu(key, rcu);
1467 	return 0;
1468 }
1469 EXPORT_IPV6_MOD(tcp_md5_do_del);
1470 
1471 void tcp_clear_md5_list(struct sock *sk)
1472 {
1473 	struct tcp_sock *tp = tcp_sk(sk);
1474 	struct tcp_md5sig_key *key;
1475 	struct hlist_node *n;
1476 	struct tcp_md5sig_info *md5sig;
1477 
1478 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1479 
1480 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1481 		hlist_del(&key->node);
1482 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1483 		kfree(key);
1484 	}
1485 }
1486 
1487 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1488 				 sockptr_t optval, int optlen)
1489 {
1490 	struct tcp_md5sig cmd;
1491 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1492 	const union tcp_md5_addr *addr;
1493 	u8 prefixlen = 32;
1494 	int l3index = 0;
1495 	bool l3flag;
1496 	u8 flags;
1497 
1498 	if (optlen < sizeof(cmd))
1499 		return -EINVAL;
1500 
1501 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1502 		return -EFAULT;
1503 
1504 	if (sin->sin_family != AF_INET)
1505 		return -EINVAL;
1506 
1507 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1508 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1509 
1510 	if (optname == TCP_MD5SIG_EXT &&
1511 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1512 		prefixlen = cmd.tcpm_prefixlen;
1513 		if (prefixlen > 32)
1514 			return -EINVAL;
1515 	}
1516 
1517 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1518 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1519 		struct net_device *dev;
1520 
1521 		rcu_read_lock();
1522 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1523 		if (dev && netif_is_l3_master(dev))
1524 			l3index = dev->ifindex;
1525 
1526 		rcu_read_unlock();
1527 
1528 		/* ok to reference set/not set outside of rcu;
1529 		 * right now device MUST be an L3 master
1530 		 */
1531 		if (!dev || !l3index)
1532 			return -EINVAL;
1533 	}
1534 
1535 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1536 
1537 	if (!cmd.tcpm_keylen)
1538 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1539 
1540 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1541 		return -EINVAL;
1542 
1543 	/* Don't allow keys for peers that have a matching TCP-AO key.
1544 	 * See the comment in tcp_ao_add_cmd()
1545 	 */
1546 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1547 		return -EKEYREJECTED;
1548 
1549 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1550 			      cmd.tcpm_key, cmd.tcpm_keylen);
1551 }
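/* Illustrative userspace counterpart of the parsing above (not part of this
 * file): installing an MD5 key for a peer with setsockopt(TCP_MD5SIG):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */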
1552 
1553 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1554 				    __be32 daddr, __be32 saddr,
1555 				    const struct tcphdr *th, int nbytes)
1556 {
1557 	struct {
1558 		struct tcp4_pseudohdr ip;
1559 		struct tcphdr tcp;
1560 	} h;
1561 
1562 	h.ip.saddr = saddr;
1563 	h.ip.daddr = daddr;
1564 	h.ip.pad = 0;
1565 	h.ip.protocol = IPPROTO_TCP;
1566 	h.ip.len = cpu_to_be16(nbytes);
1567 	h.tcp = *th;
1568 	h.tcp.check = 0;
1569 	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1570 }
1571 
1572 static noinline_for_stack void
1573 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1574 		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
1575 {
1576 	struct md5_ctx ctx;
1577 
1578 	md5_init(&ctx);
1579 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1580 	tcp_md5_hash_key(&ctx, key);
1581 	md5_final(&ctx, md5_hash);
1582 }
1583 
1584 noinline_for_stack void
1585 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1586 		    const struct sock *sk, const struct sk_buff *skb)
1587 {
1588 	const struct tcphdr *th = tcp_hdr(skb);
1589 	__be32 saddr, daddr;
1590 	struct md5_ctx ctx;
1591 
1592 	if (sk) { /* valid for establish/request sockets */
1593 		saddr = sk->sk_rcv_saddr;
1594 		daddr = sk->sk_daddr;
1595 	} else {
1596 		const struct iphdr *iph = ip_hdr(skb);
1597 		saddr = iph->saddr;
1598 		daddr = iph->daddr;
1599 	}
1600 
1601 	md5_init(&ctx);
1602 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1603 	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1604 	tcp_md5_hash_key(&ctx, key);
1605 	md5_final(&ctx, md5_hash);
1606 }
1607 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1608 
1609 #endif
1610 
1611 static void tcp_v4_init_req(struct request_sock *req,
1612 			    const struct sock *sk_listener,
1613 			    struct sk_buff *skb)
1614 {
1615 	struct inet_request_sock *ireq = inet_rsk(req);
1616 	struct net *net = sock_net(sk_listener);
1617 
1618 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1619 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1620 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1621 }
1622 
1623 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1624 					  struct sk_buff *skb,
1625 					  struct flowi *fl,
1626 					  struct request_sock *req,
1627 					  u32 tw_isn)
1628 {
1629 	tcp_v4_init_req(req, sk, skb);
1630 
1631 	if (security_inet_conn_request(sk, skb, req))
1632 		return NULL;
1633 
1634 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1635 }
1636 
1637 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1638 	.family		=	PF_INET,
1639 	.obj_size	=	sizeof(struct tcp_request_sock),
1640 	.send_ack	=	tcp_v4_reqsk_send_ack,
1641 	.destructor	=	tcp_v4_reqsk_destructor,
1642 	.send_reset	=	tcp_v4_send_reset,
1643 };
1644 
1645 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1646 	.mss_clamp	=	TCP_MSS_DEFAULT,
1647 #ifdef CONFIG_TCP_MD5SIG
1648 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1649 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1650 #endif
1651 #ifdef CONFIG_TCP_AO
1652 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1653 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1654 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1655 #endif
1656 #ifdef CONFIG_SYN_COOKIES
1657 	.cookie_init_seq =	cookie_v4_init_sequence,
1658 #endif
1659 	.route_req	=	tcp_v4_route_req,
1660 	.init_seq_and_ts_off	=	tcp_v4_init_seq_and_ts_off,
1661 	.send_synack	=	tcp_v4_send_synack,
1662 };
1663 
1664 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1665 {
1666 	/* Never answer SYNs sent to broadcast or multicast */
1667 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1668 		goto drop;
1669 
1670 	return tcp_conn_request(&tcp_request_sock_ops,
1671 				&tcp_request_sock_ipv4_ops, sk, skb);
1672 
1673 drop:
1674 	tcp_listendrop(sk);
1675 	return 0;
1676 }
1677 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1678 
1679 
1680 /*
1681  * The three-way handshake has completed - we got a valid ACK for our
1682  * SYN-ACK (or a valid Fast Open SYN) - now create the new socket.
1683  */
1684 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1685 				  struct request_sock *req,
1686 				  struct dst_entry *dst,
1687 				  struct request_sock *req_unhash,
1688 				  bool *own_req,
1689 				  void (*opt_child_init)(struct sock *newsk,
1690 							 const struct sock *sk))
1691 {
1692 	struct inet_request_sock *ireq;
1693 	bool found_dup_sk = false;
1694 	struct inet_sock *newinet;
1695 	struct tcp_sock *newtp;
1696 	struct sock *newsk;
1697 #ifdef CONFIG_TCP_MD5SIG
1698 	const union tcp_md5_addr *addr;
1699 	struct tcp_md5sig_key *key;
1700 	int l3index;
1701 #endif
1702 	struct ip_options_rcu *inet_opt;
1703 
1704 	if (sk_acceptq_is_full(sk))
1705 		goto exit_overflow;
1706 
1707 	newsk = tcp_create_openreq_child(sk, req, skb);
1708 	if (!newsk)
1709 		goto exit_nonewsk;
1710 
1711 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1712 	inet_sk_rx_dst_set(newsk, skb);
1713 
1714 	newtp		      = tcp_sk(newsk);
1715 	newinet		      = inet_sk(newsk);
1716 	ireq		      = inet_rsk(req);
1717 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1718 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1719 	newinet->mc_index     = inet_iif(skb);
1720 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1721 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1722 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1723 	if (inet_opt)
1724 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1725 	atomic_set(&newinet->inet_id, get_random_u16());
1726 
1727 	/* Set ToS of the new socket based upon the value of incoming SYN.
1728 	 * ECT bits are set later in tcp_init_transfer().
1729 	 */
1730 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1731 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1732 
1733 	if (!dst) {
1734 		dst = inet_csk_route_child_sock(sk, newsk, req);
1735 		if (!dst)
1736 			goto put_and_exit;
1737 	} else {
1738 		/* syncookie case : see end of cookie_v4_check() */
1739 	}
1740 	sk_setup_caps(newsk, dst);
1741 
1742 #if IS_ENABLED(CONFIG_IPV6)
1743 	if (opt_child_init)
1744 		opt_child_init(newsk, sk);
1745 #endif
1746 	tcp_ca_openreq_child(newsk, dst);
1747 
1748 	tcp_sync_mss(newsk, dst4_mtu(dst));
1749 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1750 
1751 	tcp_initialize_rcv_mss(newsk);
1752 
1753 #ifdef CONFIG_TCP_MD5SIG
1754 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1755 	/* Copy over the MD5 key from the original socket */
1756 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1757 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1758 	if (key && !tcp_rsk_used_ao(req)) {
1759 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1760 			goto put_and_exit;
1761 		sk_gso_disable(newsk);
1762 	}
1763 #endif
1764 #ifdef CONFIG_TCP_AO
1765 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1766 		goto put_and_exit; /* OOM, release back memory */
1767 #endif
1768 
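	/* Let the child inherit the listener's bound port, then insert it
	 * into the established hash.  *own_req tells the caller whether
	 * this child won the race to own its 4-tuple.
	 */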
1769 	if (__inet_inherit_port(sk, newsk) < 0)
1770 		goto put_and_exit;
1771 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1772 				       &found_dup_sk);
1773 	if (likely(*own_req)) {
1774 		tcp_move_syn(newtp, req);
1775 		ireq->ireq_opt = NULL;
1776 	} else {
1777 		newinet->inet_opt = NULL;
1778 
1779 		if (!req_unhash && found_dup_sk) {
1780 			/* This code path should be executed in the
1781 			 * syncookie case only.
1782 			 */
1783 			bh_unlock_sock(newsk);
1784 			sock_put(newsk);
1785 			newsk = NULL;
1786 		}
1787 	}
1788 	return newsk;
1789 
1790 exit_overflow:
1791 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1792 exit_nonewsk:
1793 	dst_release(dst);
1794 exit:
1795 	tcp_listendrop(sk);
1796 	return NULL;
1797 put_and_exit:
1798 	newinet->inet_opt = NULL;
1799 	inet_csk_prepare_forced_close(newsk);
1800 	tcp_done(newsk);
1801 	goto exit;
1802 }
1803 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1804 
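/* On a listener, a bare ACK (no SYN bit) may carry a SYN cookie: validate it
 * and, if valid, create the child socket without a stored request sock.
 */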
1805 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1806 {
1807 #ifdef CONFIG_SYN_COOKIES
1808 	const struct tcphdr *th = tcp_hdr(skb);
1809 
1810 	if (!th->syn)
1811 		sk = cookie_v4_check(sk, skb);
1812 #endif
1813 	return sk;
1814 }
1815 
1816 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1817 			 struct tcphdr *th, u32 *cookie)
1818 {
1819 	u16 mss = 0;
1820 #ifdef CONFIG_SYN_COOKIES
1821 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1822 				    &tcp_request_sock_ipv4_ops, sk, th);
1823 	if (mss) {
1824 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1825 		tcp_synq_overflow(sk);
1826 	}
1827 #endif
1828 	return mss;
1829 }
1830 
1831 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1832 							   u32));
1833 /* The socket must have its spinlock held when we get
1834  * here, unless it is a TCP_LISTEN socket.
1835  *
1836  * We have a potential double-lock case here, so even when
1837  * doing backlog processing we use the BH locking scheme.
1838  * This is because we cannot sleep with the original spinlock
1839  * held.
1840  */
1841 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1842 {
1843 	enum skb_drop_reason reason;
1844 	struct sock *rsk;
1845 
1846 	reason = psp_sk_rx_policy_check(sk, skb);
1847 	if (reason)
1848 		goto err_discard;
1849 
1850 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1851 		struct dst_entry *dst;
1852 
1853 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1854 						lockdep_sock_is_held(sk));
1855 
1856 		sock_rps_save_rxhash(sk, skb);
1857 		sk_mark_napi_id(sk, skb);
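		/* Revalidate the cached input route: drop it if the segment
		 * arrived on a different interface or the dst became stale.
		 */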
1858 		if (dst) {
1859 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1860 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1861 					     dst, 0)) {
1862 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1863 				dst_release(dst);
1864 			}
1865 		}
1866 		tcp_rcv_established(sk, skb);
1867 		return 0;
1868 	}
1869 
1870 	if (tcp_checksum_complete(skb))
1871 		goto csum_err;
1872 
1873 	if (sk->sk_state == TCP_LISTEN) {
1874 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1875 
1876 		if (!nsk)
1877 			return 0;
1878 		if (nsk != sk) {
1879 			reason = tcp_child_process(sk, nsk, skb);
1880 			if (reason) {
1881 				rsk = nsk;
1882 				goto reset;
1883 			}
1884 			return 0;
1885 		}
1886 	} else
1887 		sock_rps_save_rxhash(sk, skb);
1888 
1889 	reason = tcp_rcv_state_process(sk, skb);
1890 	if (reason) {
1891 		rsk = sk;
1892 		goto reset;
1893 	}
1894 	return 0;
1895 
1896 reset:
1897 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1898 discard:
1899 	sk_skb_reason_drop(sk, skb, reason);
1900 	/* Be careful here. If this function gets more complicated and
1901 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1902 	 * might be destroyed here. This current version compiles correctly,
1903 	 * but you have been warned.
1904 	 */
1905 	return 0;
1906 
1907 csum_err:
1908 	reason = SKB_DROP_REASON_TCP_CSUM;
1909 	trace_tcp_bad_csum(skb);
1910 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1911 err_discard:
1912 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1913 	goto discard;
1914 }
1915 EXPORT_SYMBOL(tcp_v4_do_rcv);
1916 
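/* Early demux: find an established socket for this segment before the full IP
 * input path runs, so its cached dst can be reused and a route lookup avoided.
 */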
1917 int tcp_v4_early_demux(struct sk_buff *skb)
1918 {
1919 	struct net *net = dev_net_rcu(skb->dev);
1920 	const struct iphdr *iph;
1921 	const struct tcphdr *th;
1922 	struct sock *sk;
1923 
1924 	if (skb->pkt_type != PACKET_HOST)
1925 		return 0;
1926 
1927 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1928 		return 0;
1929 
1930 	iph = ip_hdr(skb);
1931 	th = tcp_hdr(skb);
1932 
1933 	if (th->doff < sizeof(struct tcphdr) / 4)
1934 		return 0;
1935 
1936 	sk = __inet_lookup_established(net, iph->saddr, th->source,
1937 				       iph->daddr, ntohs(th->dest),
1938 				       skb->skb_iif, inet_sdif(skb));
1939 	if (sk) {
1940 		skb->sk = sk;
1941 		skb->destructor = sock_edemux;
1942 		if (sk_fullsock(sk)) {
1943 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1944 
1945 			if (dst)
1946 				dst = dst_check(dst, 0);
1947 			if (dst &&
1948 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1949 				skb_dst_set_noref(skb, dst);
1950 		}
1951 	}
1952 	return 0;
1953 }
1954 
1955 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1956 		     enum skb_drop_reason *reason)
1957 {
1958 	u32 tail_gso_size, tail_gso_segs;
1959 	struct skb_shared_info *shinfo;
1960 	const struct tcphdr *th;
1961 	struct tcphdr *thtail;
1962 	struct sk_buff *tail;
1963 	unsigned int hdrlen;
1964 	bool fragstolen;
1965 	u32 gso_segs;
1966 	u32 gso_size;
1967 	u64 limit;
1968 	int delta;
1969 	int err;
1970 
1971 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1972 	 * we can fix skb->truesize to its real value to avoid future drops.
1973 	 * This is valid because skb is not yet charged to the socket.
1974 	 * It has been noticed that pure SACK packets were sometimes dropped
1975 	 * (if cooked by drivers without the copybreak feature).
1976 	 */
1977 	skb_condense(skb);
1978 
1979 	tcp_cleanup_skb(skb);
1980 
1981 	if (unlikely(tcp_checksum_complete(skb))) {
1982 		bh_unlock_sock(sk);
1983 		trace_tcp_bad_csum(skb);
1984 		*reason = SKB_DROP_REASON_TCP_CSUM;
1985 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1986 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1987 		return true;
1988 	}
1989 
1990 	/* Attempt coalescing to last skb in backlog, even if we are
1991 	 * above the limits.
1992 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1993 	 */
1994 	th = (const struct tcphdr *)skb->data;
1995 	hdrlen = th->doff * 4;
1996 
1997 	tail = sk->sk_backlog.tail;
1998 	if (!tail)
1999 		goto no_coalesce;
2000 	thtail = (struct tcphdr *)tail->data;
2001 
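	/* Coalescing is refused unless the new segment immediately follows the
	 * tail, its TCP flags and header options are compatible, and the IP
	 * DS field matches.
	 */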
2002 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2003 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2004 	    ((TCP_SKB_CB(tail)->tcp_flags |
2005 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2006 	    !((TCP_SKB_CB(tail)->tcp_flags &
2007 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2008 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2009 	      TCP_SKB_CB(skb)->tcp_flags) &
2010 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2011 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2012 	    thtail->doff != th->doff ||
2013 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
2014 	    /* prior to PSP Rx policy check, retain exact PSP metadata */
2015 	    psp_skb_coalesce_diff(tail, skb))
2016 		goto no_coalesce;
2017 
2018 	__skb_pull(skb, hdrlen);
2019 
2020 	shinfo = skb_shinfo(skb);
2021 	gso_size = shinfo->gso_size ?: skb->len;
2022 	gso_segs = shinfo->gso_segs ?: 1;
2023 
2024 	shinfo = skb_shinfo(tail);
2025 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2026 	tail_gso_segs = shinfo->gso_segs ?: 1;
2027 
2028 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2029 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2030 
2031 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2032 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2033 			thtail->window = th->window;
2034 		}
2035 
2036 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2037 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2038 		 * is not entered if we append a packet with a FIN.
2039 		 * SYN, RST, URG are not present.
2040 		 * ACK is set on both packets.
2041 		 * PSH : we do not really care in TCP stack,
2042 		 *       at least for 'GRO' packets.
2043 		 */
2044 		thtail->fin |= th->fin;
2045 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2046 
2047 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2048 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2049 			tail->tstamp = skb->tstamp;
2050 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2051 		}
2052 
2053 		/* Not as strict as GRO. We only need to carry mss max value */
2054 		shinfo->gso_size = max(gso_size, tail_gso_size);
2055 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2056 
2057 		sk->sk_backlog.len += delta;
2058 		__NET_INC_STATS(sock_net(sk),
2059 				LINUX_MIB_TCPBACKLOGCOALESCE);
2060 		kfree_skb_partial(skb, fragstolen);
2061 		return false;
2062 	}
2063 	__skb_push(skb, hdrlen);
2064 
2065 no_coalesce:
2066 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2067 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2068 	 * sk_rcvbuf in normal conditions.
2069 	 */
2070 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2071 
2072 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2073 
2074 	/* Only the socket owner can try to collapse/prune rx queues
2075 	 * to reduce memory overhead, so add a little headroom here.
2076 	 * Only a few socket backlogs are likely to be non-empty at once.
2077 	 */
2078 	limit += 64 * 1024;
2079 
2080 	limit = min_t(u64, limit, UINT_MAX);
2081 
2082 	err = sk_add_backlog(sk, skb, limit);
2083 	if (unlikely(err)) {
2084 		bh_unlock_sock(sk);
2085 		if (err == -ENOMEM) {
2086 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2087 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2088 		} else {
2089 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2090 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2091 		}
2092 		return true;
2093 	}
2094 	return false;
2095 }
2096 EXPORT_IPV6_MOD(tcp_add_backlog);
2097 
2098 static void tcp_v4_restore_cb(struct sk_buff *skb)
2099 {
2100 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2101 		sizeof(struct inet_skb_parm));
2102 }
2103 
2104 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2105 			   const struct tcphdr *th)
2106 {
2107 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2108 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2109 	 */
2110 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2111 		sizeof(struct inet_skb_parm));
2112 	barrier();
2113 
2114 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2115 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2116 				    skb->len - th->doff * 4);
2117 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2118 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2119 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2120 	TCP_SKB_CB(skb)->sacked	 = 0;
2121 	TCP_SKB_CB(skb)->has_rxtstamp =
2122 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2123 }
2124 
2125 /*
2126  *	From tcp_input.c
2127  */
2128 
2129 int tcp_v4_rcv(struct sk_buff *skb)
2130 {
2131 	struct net *net = dev_net_rcu(skb->dev);
2132 	enum skb_drop_reason drop_reason;
2133 	enum tcp_tw_status tw_status;
2134 	int sdif = inet_sdif(skb);
2135 	int dif = inet_iif(skb);
2136 	const struct iphdr *iph;
2137 	const struct tcphdr *th;
2138 	struct sock *sk = NULL;
2139 	bool refcounted;
2140 	int ret;
2141 	u32 isn;
2142 
2143 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2144 	if (skb->pkt_type != PACKET_HOST)
2145 		goto discard_it;
2146 
2147 	/* Count it even if it's bad */
2148 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2149 
2150 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2151 		goto discard_it;
2152 
2153 	th = (const struct tcphdr *)skb->data;
2154 
2155 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2156 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2157 		goto bad_packet;
2158 	}
2159 	if (!pskb_may_pull(skb, th->doff * 4))
2160 		goto discard_it;
2161 
2162 	/* An explanation is required here, I think.
2163 	 * Packet length and doff are validated by header prediction,
2164 	 * provided the case of th->doff==0 is eliminated.
2165 	 * So, we defer the checks. */
2166 
2167 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2168 		goto csum_error;
2169 
2170 	th = (const struct tcphdr *)skb->data;
2171 	iph = ip_hdr(skb);
2172 lookup:
2173 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2174 			       th->dest, sdif, &refcounted);
2175 	if (!sk)
2176 		goto no_tcp_socket;
2177 
2178 	if (sk->sk_state == TCP_TIME_WAIT)
2179 		goto do_time_wait;
2180 
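	/* The lookup returned a request socket (mini socket): validate the
	 * segment against its listener and, if it completes the handshake,
	 * create the full child socket.
	 */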
2181 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2182 		struct request_sock *req = inet_reqsk(sk);
2183 		bool req_stolen = false;
2184 		struct sock *nsk;
2185 
2186 		sk = req->rsk_listener;
2187 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2188 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2189 		else
2190 			drop_reason = tcp_inbound_hash(sk, req, skb,
2191 						       &iph->saddr, &iph->daddr,
2192 						       AF_INET, dif, sdif);
2193 		if (unlikely(drop_reason)) {
2194 			sk_drops_skbadd(sk, skb);
2195 			reqsk_put(req);
2196 			goto discard_it;
2197 		}
2198 		if (tcp_checksum_complete(skb)) {
2199 			reqsk_put(req);
2200 			goto csum_error;
2201 		}
2202 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2203 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2204 			if (!nsk) {
2205 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2206 				goto lookup;
2207 			}
2208 			sk = nsk;
2209 			/* reuseport_migrate_sock() has already held one sk_refcnt
2210 			 * before returning.
2211 			 */
2212 		} else {
2213 			/* We own a reference on the listener, increase it again
2214 			 * as we might lose it too soon.
2215 			 */
2216 			sock_hold(sk);
2217 		}
2218 		refcounted = true;
2219 		nsk = NULL;
2220 		if (!tcp_filter(sk, skb, &drop_reason)) {
2221 			th = (const struct tcphdr *)skb->data;
2222 			iph = ip_hdr(skb);
2223 			tcp_v4_fill_cb(skb, iph, th);
2224 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2225 					    &drop_reason);
2226 		}
2227 		if (!nsk) {
2228 			reqsk_put(req);
2229 			if (req_stolen) {
2230 				/* Another cpu got exclusive access to req
2231 				 * and created a full blown socket.
2232 				 * Try to feed this packet to this socket
2233 				 * instead of discarding it.
2234 				 */
2235 				tcp_v4_restore_cb(skb);
2236 				sock_put(sk);
2237 				goto lookup;
2238 			}
2239 			goto discard_and_relse;
2240 		}
2241 		nf_reset_ct(skb);
2242 		if (nsk == sk) {
2243 			reqsk_put(req);
2244 			tcp_v4_restore_cb(skb);
2245 		} else {
2246 			drop_reason = tcp_child_process(sk, nsk, skb);
2247 			if (drop_reason) {
2248 				enum sk_rst_reason rst_reason;
2249 
2250 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2251 				tcp_v4_send_reset(nsk, skb, rst_reason);
2252 				goto discard_and_relse;
2253 			}
2254 			sock_put(sk);
2255 			return 0;
2256 		}
2257 	}
2258 
2259 process:
2260 	if (static_branch_unlikely(&ip4_min_ttl)) {
2261 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2262 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2263 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2264 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2265 			goto discard_and_relse;
2266 		}
2267 	}
2268 
2269 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2270 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2271 		goto discard_and_relse;
2272 	}
2273 
2274 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2275 				       AF_INET, dif, sdif);
2276 	if (drop_reason)
2277 		goto discard_and_relse;
2278 
2279 	nf_reset_ct(skb);
2280 
2281 	if (tcp_filter(sk, skb, &drop_reason))
2282 		goto discard_and_relse;
2283 
2284 	th = (const struct tcphdr *)skb->data;
2285 	iph = ip_hdr(skb);
2286 	tcp_v4_fill_cb(skb, iph, th);
2287 
2288 	skb->dev = NULL;
2289 
2290 	if (sk->sk_state == TCP_LISTEN) {
2291 		ret = tcp_v4_do_rcv(sk, skb);
2292 		goto put_and_return;
2293 	}
2294 
2295 	sk_incoming_cpu_update(sk);
2296 
2297 	bh_lock_sock_nested(sk);
2298 	tcp_segs_in(tcp_sk(sk), skb);
2299 	ret = 0;
2300 	if (!sock_owned_by_user(sk)) {
2301 		ret = tcp_v4_do_rcv(sk, skb);
2302 	} else {
2303 		if (tcp_add_backlog(sk, skb, &drop_reason))
2304 			goto discard_and_relse;
2305 	}
2306 	bh_unlock_sock(sk);
2307 
2308 put_and_return:
2309 	if (refcounted)
2310 		sock_put(sk);
2311 
2312 	return ret;
2313 
2314 no_tcp_socket:
2315 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2316 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2317 		goto discard_it;
2318 
2319 	tcp_v4_fill_cb(skb, iph, th);
2320 
2321 	if (tcp_checksum_complete(skb)) {
2322 csum_error:
2323 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2324 		trace_tcp_bad_csum(skb);
2325 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2326 bad_packet:
2327 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2328 	} else {
2329 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2330 	}
2331 
2332 discard_it:
2333 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2334 	/* Discard frame. */
2335 	sk_skb_reason_drop(sk, skb, drop_reason);
2336 	return 0;
2337 
2338 discard_and_relse:
2339 	sk_drops_skbadd(sk, skb);
2340 	if (refcounted)
2341 		sock_put(sk);
2342 	goto discard_it;
2343 
2344 do_time_wait:
2345 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2346 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2347 		inet_twsk_put(inet_twsk(sk));
2348 		goto discard_it;
2349 	}
2350 
2351 	tcp_v4_fill_cb(skb, iph, th);
2352 
2353 	if (tcp_checksum_complete(skb)) {
2354 		inet_twsk_put(inet_twsk(sk));
2355 		goto csum_error;
2356 	}
2357 
2358 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2359 					       &drop_reason);
2360 	switch (tw_status) {
2361 	case TCP_TW_SYN: {
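		/* A new SYN hit a TIME_WAIT socket: if a matching listener
		 * still exists, recycle the timewait entry and let the
		 * listener handle the SYN.
		 */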
2362 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2363 							iph->saddr, th->source,
2364 							iph->daddr, th->dest,
2365 							inet_iif(skb),
2366 							sdif);
2367 		if (sk2) {
2368 			inet_twsk_deschedule_put(inet_twsk(sk));
2369 			sk = sk2;
2370 			tcp_v4_restore_cb(skb);
2371 			refcounted = false;
2372 			__this_cpu_write(tcp_tw_isn, isn);
2373 			goto process;
2374 		}
2375 
2376 		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
2377 		if (drop_reason)
2378 			break;
2379 	}
2380 		/* to ACK */
2381 		fallthrough;
2382 	case TCP_TW_ACK:
2383 	case TCP_TW_ACK_OOW:
2384 		tcp_v4_timewait_ack(sk, skb, tw_status);
2385 		break;
2386 	case TCP_TW_RST:
2387 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2388 		inet_twsk_deschedule_put(inet_twsk(sk));
2389 		goto discard_it;
2390 	case TCP_TW_SUCCESS:;
2391 	}
2392 	goto discard_it;
2393 }
2394 
2395 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2396 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2397 };
2398 
2399 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2400 {
2401 	struct dst_entry *dst = skb_dst(skb);
2402 
2403 	if (dst && dst_hold_safe(dst)) {
2404 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2405 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2406 	}
2407 }
2408 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2409 
2410 const struct inet_connection_sock_af_ops ipv4_specific = {
2411 	.queue_xmit	   = ip_queue_xmit,
2412 	.rebuild_header	   = inet_sk_rebuild_header,
2413 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2414 	.conn_request	   = tcp_v4_conn_request,
2415 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2416 	.net_header_len	   = sizeof(struct iphdr),
2417 	.setsockopt	   = ip_setsockopt,
2418 	.getsockopt	   = ip_getsockopt,
2419 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2420 };
2421 EXPORT_IPV6_MOD(ipv4_specific);
2422 
2423 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2424 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2425 #ifdef CONFIG_TCP_MD5SIG
2426 	.md5_lookup		= tcp_v4_md5_lookup,
2427 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2428 	.md5_parse		= tcp_v4_parse_md5_keys,
2429 #endif
2430 #ifdef CONFIG_TCP_AO
2431 	.ao_lookup		= tcp_v4_ao_lookup,
2432 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2433 	.ao_parse		= tcp_v4_parse_ao,
2434 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2435 #endif
2436 };
2437 
2438 static void tcp4_destruct_sock(struct sock *sk)
2439 {
2440 	tcp_md5_destruct_sock(sk);
2441 	tcp_ao_destroy_sock(sk, false);
2442 	inet_sock_destruct(sk);
2443 }
2444 #endif
2445 
2446 /* NOTE: A lot of things are set to zero explicitly by the call to
2447  *       sk_alloc(), so they need not be done here.
2448  */
2449 static int tcp_v4_init_sock(struct sock *sk)
2450 {
2451 	struct inet_connection_sock *icsk = inet_csk(sk);
2452 
2453 	tcp_init_sock(sk);
2454 
2455 	icsk->icsk_af_ops = &ipv4_specific;
2456 
2457 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2458 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2459 	sk->sk_destruct = tcp4_destruct_sock;
2460 #endif
2461 
2462 	return 0;
2463 }
2464 
2465 static void tcp_release_user_frags(struct sock *sk)
2466 {
2467 #ifdef CONFIG_PAGE_POOL
2468 	unsigned long index;
2469 	void *netmem;
2470 
2471 	xa_for_each(&sk->sk_user_frags, index, netmem)
2472 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2473 #endif
2474 }
2475 
2476 void tcp_v4_destroy_sock(struct sock *sk)
2477 {
2478 	struct tcp_sock *tp = tcp_sk(sk);
2479 
2480 	tcp_release_user_frags(sk);
2481 
2482 	xa_destroy(&sk->sk_user_frags);
2483 
2484 	trace_tcp_destroy_sock(sk);
2485 
2486 	tcp_clear_xmit_timers(sk);
2487 
2488 	tcp_cleanup_congestion_control(sk);
2489 
2490 	tcp_cleanup_ulp(sk);
2491 
2492 	/* Clean up the write buffer. */
2493 	tcp_write_queue_purge(sk);
2494 
2495 	/* Check if we want to disable active TFO */
2496 	tcp_fastopen_active_disable_ofo_check(sk);
2497 
2498 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2499 	skb_rbtree_purge(&tp->out_of_order_queue);
2500 
2501 	/* Clean up a referenced TCP bind bucket. */
2502 	if (inet_csk(sk)->icsk_bind_hash)
2503 		inet_put_port(sk);
2504 
2505 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2506 
2507 	/* If socket is aborted during connect operation */
2508 	tcp_free_fastopen_req(tp);
2509 	tcp_fastopen_destroy_cipher(sk);
2510 	tcp_saved_syn_free(tp);
2511 
2512 	sk_sockets_allocated_dec(sk);
2513 }
2514 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2515 
2516 #ifdef CONFIG_PROC_FS
2517 /* Proc filesystem TCP sock list dumping. */
2518 
2519 static unsigned short seq_file_family(const struct seq_file *seq);
2520 
2521 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2522 {
2523 	unsigned short family = seq_file_family(seq);
2524 
2525 	/* AF_UNSPEC is used as a match all */
2526 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2527 		net_eq(sock_net(sk), seq_file_net(seq)));
2528 }
2529 
2530 /* Find a non empty bucket (starting from st->bucket)
2531  * and return the first sk from it.
2532  */
2533 static void *listening_get_first(struct seq_file *seq)
2534 {
2535 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2536 	struct tcp_iter_state *st = seq->private;
2537 
2538 	st->offset = 0;
2539 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2540 		struct inet_listen_hashbucket *ilb2;
2541 		struct hlist_nulls_node *node;
2542 		struct sock *sk;
2543 
2544 		ilb2 = &hinfo->lhash2[st->bucket];
2545 		if (hlist_nulls_empty(&ilb2->nulls_head))
2546 			continue;
2547 
2548 		spin_lock(&ilb2->lock);
2549 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2550 			if (seq_sk_match(seq, sk))
2551 				return sk;
2552 		}
2553 		spin_unlock(&ilb2->lock);
2554 	}
2555 
2556 	return NULL;
2557 }
2558 
2559 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2560  * If "cur" is the last one in the st->bucket,
2561  * call listening_get_first() to return the first sk of the next
2562  * non empty bucket.
2563  */
2564 static void *listening_get_next(struct seq_file *seq, void *cur)
2565 {
2566 	struct tcp_iter_state *st = seq->private;
2567 	struct inet_listen_hashbucket *ilb2;
2568 	struct hlist_nulls_node *node;
2569 	struct inet_hashinfo *hinfo;
2570 	struct sock *sk = cur;
2571 
2572 	++st->num;
2573 	++st->offset;
2574 
2575 	sk = sk_nulls_next(sk);
2576 	sk_nulls_for_each_from(sk, node) {
2577 		if (seq_sk_match(seq, sk))
2578 			return sk;
2579 	}
2580 
2581 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2582 	ilb2 = &hinfo->lhash2[st->bucket];
2583 	spin_unlock(&ilb2->lock);
2584 	++st->bucket;
2585 	return listening_get_first(seq);
2586 }
2587 
2588 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2589 {
2590 	struct tcp_iter_state *st = seq->private;
2591 	void *rc;
2592 
2593 	st->bucket = 0;
2594 	st->offset = 0;
2595 	rc = listening_get_first(seq);
2596 
2597 	while (rc && *pos) {
2598 		rc = listening_get_next(seq, rc);
2599 		--*pos;
2600 	}
2601 	return rc;
2602 }
2603 
2604 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2605 				const struct tcp_iter_state *st)
2606 {
2607 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2608 }
2609 
2610 /*
2611  * Get first established socket starting from bucket given in st->bucket.
2612  * If st->bucket is zero, the very first socket in the hash is returned.
2613  */
2614 static void *established_get_first(struct seq_file *seq)
2615 {
2616 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2617 	struct tcp_iter_state *st = seq->private;
2618 
2619 	st->offset = 0;
2620 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2621 		struct sock *sk;
2622 		struct hlist_nulls_node *node;
2623 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2624 
2625 		cond_resched();
2626 
2627 		/* Lockless fast path for the common case of empty buckets */
2628 		if (empty_bucket(hinfo, st))
2629 			continue;
2630 
2631 		spin_lock_bh(lock);
2632 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2633 			if (seq_sk_match(seq, sk))
2634 				return sk;
2635 		}
2636 		spin_unlock_bh(lock);
2637 	}
2638 
2639 	return NULL;
2640 }
2641 
2642 static void *established_get_next(struct seq_file *seq, void *cur)
2643 {
2644 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2645 	struct tcp_iter_state *st = seq->private;
2646 	struct hlist_nulls_node *node;
2647 	struct sock *sk = cur;
2648 
2649 	++st->num;
2650 	++st->offset;
2651 
2652 	sk = sk_nulls_next(sk);
2653 
2654 	sk_nulls_for_each_from(sk, node) {
2655 		if (seq_sk_match(seq, sk))
2656 			return sk;
2657 	}
2658 
2659 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2660 	++st->bucket;
2661 	return established_get_first(seq);
2662 }
2663 
2664 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2665 {
2666 	struct tcp_iter_state *st = seq->private;
2667 	void *rc;
2668 
2669 	st->bucket = 0;
2670 	rc = established_get_first(seq);
2671 
2672 	while (rc && pos) {
2673 		rc = established_get_next(seq, rc);
2674 		--pos;
2675 	}
2676 	return rc;
2677 }
2678 
2679 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2680 {
2681 	void *rc;
2682 	struct tcp_iter_state *st = seq->private;
2683 
2684 	st->state = TCP_SEQ_STATE_LISTENING;
2685 	rc	  = listening_get_idx(seq, &pos);
2686 
2687 	if (!rc) {
2688 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2689 		rc	  = established_get_idx(seq, pos);
2690 	}
2691 
2692 	return rc;
2693 }
2694 
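/* Re-walk to the bucket/offset recorded in the iterator state after a
 * seq_file restart, leaving st->num untouched.
 */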
2695 static void *tcp_seek_last_pos(struct seq_file *seq)
2696 {
2697 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2698 	struct tcp_iter_state *st = seq->private;
2699 	int bucket = st->bucket;
2700 	int offset = st->offset;
2701 	int orig_num = st->num;
2702 	void *rc = NULL;
2703 
2704 	switch (st->state) {
2705 	case TCP_SEQ_STATE_LISTENING:
2706 		if (st->bucket > hinfo->lhash2_mask)
2707 			break;
2708 		rc = listening_get_first(seq);
2709 		while (offset-- && rc && bucket == st->bucket)
2710 			rc = listening_get_next(seq, rc);
2711 		if (rc)
2712 			break;
2713 		st->bucket = 0;
2714 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2715 		fallthrough;
2716 	case TCP_SEQ_STATE_ESTABLISHED:
2717 		if (st->bucket > hinfo->ehash_mask)
2718 			break;
2719 		rc = established_get_first(seq);
2720 		while (offset-- && rc && bucket == st->bucket)
2721 			rc = established_get_next(seq, rc);
2722 	}
2723 
2724 	st->num = orig_num;
2725 
2726 	return rc;
2727 }
2728 
2729 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2730 {
2731 	struct tcp_iter_state *st = seq->private;
2732 	void *rc;
2733 
2734 	if (*pos && *pos == st->last_pos) {
2735 		rc = tcp_seek_last_pos(seq);
2736 		if (rc)
2737 			goto out;
2738 	}
2739 
2740 	st->state = TCP_SEQ_STATE_LISTENING;
2741 	st->num = 0;
2742 	st->bucket = 0;
2743 	st->offset = 0;
2744 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2745 
2746 out:
2747 	st->last_pos = *pos;
2748 	return rc;
2749 }
2750 EXPORT_IPV6_MOD(tcp_seq_start);
2751 
2752 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2753 {
2754 	struct tcp_iter_state *st = seq->private;
2755 	void *rc = NULL;
2756 
2757 	if (v == SEQ_START_TOKEN) {
2758 		rc = tcp_get_idx(seq, 0);
2759 		goto out;
2760 	}
2761 
2762 	switch (st->state) {
2763 	case TCP_SEQ_STATE_LISTENING:
2764 		rc = listening_get_next(seq, v);
2765 		if (!rc) {
2766 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2767 			st->bucket = 0;
2768 			st->offset = 0;
2769 			rc	  = established_get_first(seq);
2770 		}
2771 		break;
2772 	case TCP_SEQ_STATE_ESTABLISHED:
2773 		rc = established_get_next(seq, v);
2774 		break;
2775 	}
2776 out:
2777 	++*pos;
2778 	st->last_pos = *pos;
2779 	return rc;
2780 }
2781 EXPORT_IPV6_MOD(tcp_seq_next);
2782 
2783 void tcp_seq_stop(struct seq_file *seq, void *v)
2784 {
2785 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2786 	struct tcp_iter_state *st = seq->private;
2787 
2788 	switch (st->state) {
2789 	case TCP_SEQ_STATE_LISTENING:
2790 		if (v != SEQ_START_TOKEN)
2791 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2792 		break;
2793 	case TCP_SEQ_STATE_ESTABLISHED:
2794 		if (v)
2795 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2796 		break;
2797 	}
2798 }
2799 EXPORT_IPV6_MOD(tcp_seq_stop);
2800 
2801 static void get_openreq4(const struct request_sock *req,
2802 			 struct seq_file *f, int i)
2803 {
2804 	const struct inet_request_sock *ireq = inet_rsk(req);
2805 	long delta = req->rsk_timer.expires - jiffies;
2806 
2807 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2808 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2809 		i,
2810 		ireq->ir_loc_addr,
2811 		ireq->ir_num,
2812 		ireq->ir_rmt_addr,
2813 		ntohs(ireq->ir_rmt_port),
2814 		TCP_SYN_RECV,
2815 		0, 0, /* could print option size, but that is af dependent. */
2816 		1,    /* timers active (only the expire timer) */
2817 		jiffies_delta_to_clock_t(delta),
2818 		req->num_timeout,
2819 		from_kuid_munged(seq_user_ns(f),
2820 				 sk_uid(req->rsk_listener)),
2821 		0,  /* non standard timer */
2822 		0, /* open_requests have no inode */
2823 		0,
2824 		req);
2825 }
2826 
2827 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2828 {
2829 	int timer_active;
2830 	unsigned long timer_expires;
2831 	const struct tcp_sock *tp = tcp_sk(sk);
2832 	const struct inet_connection_sock *icsk = inet_csk(sk);
2833 	const struct inet_sock *inet = inet_sk(sk);
2834 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2835 	__be32 dest = inet->inet_daddr;
2836 	__be32 src = inet->inet_rcv_saddr;
2837 	__u16 destp = ntohs(inet->inet_dport);
2838 	__u16 srcp = ntohs(inet->inet_sport);
2839 	u8 icsk_pending;
2840 	int rx_queue;
2841 	int state;
2842 
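	/* timer_active uses the historical /proc/net/tcp encoding:
	 * 1 retransmit/loss probe, 2 keepalive, 4 zero window probe, 0 none.
	 */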
2843 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2844 	if (icsk_pending == ICSK_TIME_RETRANS ||
2845 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2846 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2847 		timer_active	= 1;
2848 		timer_expires	= tcp_timeout_expires(sk);
2849 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2850 		timer_active	= 4;
2851 		timer_expires	= tcp_timeout_expires(sk);
2852 	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
2853 		timer_active	= 2;
2854 		timer_expires	= icsk->icsk_keepalive_timer.expires;
2855 	} else {
2856 		timer_active	= 0;
2857 		timer_expires = jiffies;
2858 	}
2859 
2860 	state = inet_sk_state_load(sk);
2861 	if (state == TCP_LISTEN)
2862 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2863 	else
2864 		/* Because we don't lock the socket,
2865 		 * we might find a transient negative value.
2866 		 */
2867 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2868 				      READ_ONCE(tp->copied_seq), 0);
2869 
2870 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2871 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2872 		i, src, srcp, dest, destp, state,
2873 		READ_ONCE(tp->write_seq) - tp->snd_una,
2874 		rx_queue,
2875 		timer_active,
2876 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2877 		READ_ONCE(icsk->icsk_retransmits),
2878 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2879 		READ_ONCE(icsk->icsk_probes_out),
2880 		sock_i_ino(sk),
2881 		refcount_read(&sk->sk_refcnt), sk,
2882 		jiffies_to_clock_t(icsk->icsk_rto),
2883 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2884 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2885 		tcp_snd_cwnd(tp),
2886 		state == TCP_LISTEN ?
2887 		    fastopenq->max_qlen :
2888 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2889 }
2890 
2891 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2892 			       struct seq_file *f, int i)
2893 {
2894 	long delta = tw->tw_timer.expires - jiffies;
2895 	__be32 dest, src;
2896 	__u16 destp, srcp;
2897 
2898 	dest  = tw->tw_daddr;
2899 	src   = tw->tw_rcv_saddr;
2900 	destp = ntohs(tw->tw_dport);
2901 	srcp  = ntohs(tw->tw_sport);
2902 
2903 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2904 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2905 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2906 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2907 		refcount_read(&tw->tw_refcnt), tw);
2908 }
2909 
2910 #define TMPSZ 150
2911 
2912 static int tcp4_seq_show(struct seq_file *seq, void *v)
2913 {
2914 	struct tcp_iter_state *st;
2915 	struct sock *sk = v;
2916 
2917 	seq_setwidth(seq, TMPSZ - 1);
2918 	if (v == SEQ_START_TOKEN) {
2919 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2920 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2921 			   "inode");
2922 		goto out;
2923 	}
2924 	st = seq->private;
2925 
2926 	if (sk->sk_state == TCP_TIME_WAIT)
2927 		get_timewait4_sock(v, seq, st->num);
2928 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2929 		get_openreq4(v, seq, st->num);
2930 	else
2931 		get_tcp4_sock(v, seq, st->num);
2932 out:
2933 	seq_pad(seq, '\n');
2934 	return 0;
2935 }
2936 
2937 #ifdef CONFIG_BPF_SYSCALL
2938 union bpf_tcp_iter_batch_item {
2939 	struct sock *sk;
2940 	__u64 cookie;
2941 };
2942 
2943 struct bpf_tcp_iter_state {
2944 	struct tcp_iter_state state;
2945 	unsigned int cur_sk;
2946 	unsigned int end_sk;
2947 	unsigned int max_sk;
2948 	union bpf_tcp_iter_batch_item *batch;
2949 };
2950 
2951 struct bpf_iter__tcp {
2952 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2953 	__bpf_md_ptr(struct sock_common *, sk_common);
2954 	uid_t uid __aligned(8);
2955 };
2956 
2957 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2958 			     struct sock_common *sk_common, uid_t uid)
2959 {
2960 	struct bpf_iter__tcp ctx;
2961 
2962 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2963 	ctx.meta = meta;
2964 	ctx.sk_common = sk_common;
2965 	ctx.uid = uid;
2966 	return bpf_iter_run_prog(prog, &ctx);
2967 }
2968 
2969 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2970 {
2971 	union bpf_tcp_iter_batch_item *item;
2972 	unsigned int cur_sk = iter->cur_sk;
2973 	__u64 cookie;
2974 
2975 	/* Remember the cookies of the sockets we haven't seen yet, so we can
2976 	 * pick up where we left off next time around.
2977 	 */
2978 	while (cur_sk < iter->end_sk) {
2979 		item = &iter->batch[cur_sk++];
2980 		cookie = sock_gen_cookie(item->sk);
2981 		sock_gen_put(item->sk);
2982 		item->cookie = cookie;
2983 	}
2984 }
2985 
2986 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2987 				      unsigned int new_batch_sz, gfp_t flags)
2988 {
2989 	union bpf_tcp_iter_batch_item *new_batch;
2990 
2991 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2992 			     flags | __GFP_NOWARN);
2993 	if (!new_batch)
2994 		return -ENOMEM;
2995 
2996 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
2997 	kvfree(iter->batch);
2998 	iter->batch = new_batch;
2999 	iter->max_sk = new_batch_sz;
3000 
3001 	return 0;
3002 }
3003 
3004 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3005 					       union bpf_tcp_iter_batch_item *cookies,
3006 					       int n_cookies)
3007 {
3008 	struct hlist_nulls_node *node;
3009 	struct sock *sk;
3010 	int i;
3011 
3012 	for (i = 0; i < n_cookies; i++) {
3013 		sk = first_sk;
3014 		sk_nulls_for_each_from(sk, node)
3015 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3016 				return sk;
3017 	}
3018 
3019 	return NULL;
3020 }
3021 
3022 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3023 {
3024 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3025 	struct bpf_tcp_iter_state *iter = seq->private;
3026 	struct tcp_iter_state *st = &iter->state;
3027 	unsigned int find_cookie = iter->cur_sk;
3028 	unsigned int end_cookie = iter->end_sk;
3029 	int resume_bucket = st->bucket;
3030 	struct sock *sk;
3031 
3032 	if (end_cookie && find_cookie == end_cookie)
3033 		++st->bucket;
3034 
3035 	sk = listening_get_first(seq);
3036 	iter->cur_sk = 0;
3037 	iter->end_sk = 0;
3038 
3039 	if (sk && st->bucket == resume_bucket && end_cookie) {
3040 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3041 						end_cookie - find_cookie);
3042 		if (!sk) {
3043 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3044 			++st->bucket;
3045 			sk = listening_get_first(seq);
3046 		}
3047 	}
3048 
3049 	return sk;
3050 }
3051 
3052 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3053 {
3054 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3055 	struct bpf_tcp_iter_state *iter = seq->private;
3056 	struct tcp_iter_state *st = &iter->state;
3057 	unsigned int find_cookie = iter->cur_sk;
3058 	unsigned int end_cookie = iter->end_sk;
3059 	int resume_bucket = st->bucket;
3060 	struct sock *sk;
3061 
3062 	if (end_cookie && find_cookie == end_cookie)
3063 		++st->bucket;
3064 
3065 	sk = established_get_first(seq);
3066 	iter->cur_sk = 0;
3067 	iter->end_sk = 0;
3068 
3069 	if (sk && st->bucket == resume_bucket && end_cookie) {
3070 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3071 						end_cookie - find_cookie);
3072 		if (!sk) {
3073 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3074 			++st->bucket;
3075 			sk = established_get_first(seq);
3076 		}
3077 	}
3078 
3079 	return sk;
3080 }
3081 
3082 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3083 {
3084 	struct bpf_tcp_iter_state *iter = seq->private;
3085 	struct tcp_iter_state *st = &iter->state;
3086 	struct sock *sk = NULL;
3087 
3088 	switch (st->state) {
3089 	case TCP_SEQ_STATE_LISTENING:
3090 		sk = bpf_iter_tcp_resume_listening(seq);
3091 		if (sk)
3092 			break;
3093 		st->bucket = 0;
3094 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3095 		fallthrough;
3096 	case TCP_SEQ_STATE_ESTABLISHED:
3097 		sk = bpf_iter_tcp_resume_established(seq);
3098 		break;
3099 	}
3100 
3101 	return sk;
3102 }
3103 
3104 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3105 						 struct sock **start_sk)
3106 {
3107 	struct bpf_tcp_iter_state *iter = seq->private;
3108 	struct hlist_nulls_node *node;
3109 	unsigned int expected = 1;
3110 	struct sock *sk;
3111 
3112 	sock_hold(*start_sk);
3113 	iter->batch[iter->end_sk++].sk = *start_sk;
3114 
3115 	sk = sk_nulls_next(*start_sk);
3116 	*start_sk = NULL;
3117 	sk_nulls_for_each_from(sk, node) {
3118 		if (seq_sk_match(seq, sk)) {
3119 			if (iter->end_sk < iter->max_sk) {
3120 				sock_hold(sk);
3121 				iter->batch[iter->end_sk++].sk = sk;
3122 			} else if (!*start_sk) {
3123 				/* Remember where we left off. */
3124 				*start_sk = sk;
3125 			}
3126 			expected++;
3127 		}
3128 	}
3129 
3130 	return expected;
3131 }
3132 
3133 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3134 						   struct sock **start_sk)
3135 {
3136 	struct bpf_tcp_iter_state *iter = seq->private;
3137 	struct hlist_nulls_node *node;
3138 	unsigned int expected = 1;
3139 	struct sock *sk;
3140 
3141 	sock_hold(*start_sk);
3142 	iter->batch[iter->end_sk++].sk = *start_sk;
3143 
3144 	sk = sk_nulls_next(*start_sk);
3145 	*start_sk = NULL;
3146 	sk_nulls_for_each_from(sk, node) {
3147 		if (seq_sk_match(seq, sk)) {
3148 			if (iter->end_sk < iter->max_sk) {
3149 				sock_hold(sk);
3150 				iter->batch[iter->end_sk++].sk = sk;
3151 			} else if (!*start_sk) {
3152 				/* Remember where we left off. */
3153 				*start_sk = sk;
3154 			}
3155 			expected++;
3156 		}
3157 	}
3158 
3159 	return expected;
3160 }
3161 
3162 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3163 					struct sock **start_sk)
3164 {
3165 	struct bpf_tcp_iter_state *iter = seq->private;
3166 	struct tcp_iter_state *st = &iter->state;
3167 
3168 	if (st->state == TCP_SEQ_STATE_LISTENING)
3169 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3170 	else
3171 		return bpf_iter_tcp_established_batch(seq, start_sk);
3172 }
3173 
3174 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3175 {
3176 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3177 	struct bpf_tcp_iter_state *iter = seq->private;
3178 	struct tcp_iter_state *st = &iter->state;
3179 
3180 	if (st->state == TCP_SEQ_STATE_LISTENING)
3181 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3182 	else
3183 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3184 }
3185 
3186 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3187 {
3188 	struct bpf_tcp_iter_state *iter = seq->private;
3189 	unsigned int expected;
3190 	struct sock *sk;
3191 	int err;
3192 
3193 	sk = bpf_iter_tcp_resume(seq);
3194 	if (!sk)
3195 		return NULL; /* Done */
3196 
3197 	expected = bpf_iter_fill_batch(seq, &sk);
3198 	if (likely(iter->end_sk == expected))
3199 		goto done;
3200 
3201 	/* Batch size was too small. */
3202 	bpf_iter_tcp_unlock_bucket(seq);
3203 	bpf_iter_tcp_put_batch(iter);
3204 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3205 					 GFP_USER);
3206 	if (err)
3207 		return ERR_PTR(err);
3208 
3209 	sk = bpf_iter_tcp_resume(seq);
3210 	if (!sk)
3211 		return NULL; /* Done */
3212 
3213 	expected = bpf_iter_fill_batch(seq, &sk);
3214 	if (likely(iter->end_sk == expected))
3215 		goto done;
3216 
3217 	/* Batch size was still too small. Hold onto the lock while we try
3218 	 * again with a larger batch to make sure the current bucket's size
3219 	 * does not change in the meantime.
3220 	 */
3221 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3222 	if (err) {
3223 		bpf_iter_tcp_unlock_bucket(seq);
3224 		return ERR_PTR(err);
3225 	}
3226 
3227 	expected = bpf_iter_fill_batch(seq, &sk);
3228 	WARN_ON_ONCE(iter->end_sk != expected);
3229 done:
3230 	bpf_iter_tcp_unlock_bucket(seq);
3231 	return iter->batch[0].sk;
3232 }
3233 
3234 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3235 {
3236 	/* bpf iter does not support lseek, so it always
3237 	 * continues from where it was stop()-ped.
3238 	 */
3239 	if (*pos)
3240 		return bpf_iter_tcp_batch(seq);
3241 
3242 	return SEQ_START_TOKEN;
3243 }
3244 
3245 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3246 {
3247 	struct bpf_tcp_iter_state *iter = seq->private;
3248 	struct tcp_iter_state *st = &iter->state;
3249 	struct sock *sk;
3250 
3251 	/* Whenever seq_next() is called, the iter->cur_sk is
3252 	 * done with seq_show(), so advance to the next sk in
3253 	 * the batch.
3254 	 */
3255 	if (iter->cur_sk < iter->end_sk) {
3256 		/* Keeping st->num consistent in tcp_iter_state.
3257 		 * bpf_iter_tcp does not use st->num.
3258 		 * meta.seq_num is used instead.
3259 		 */
3260 		st->num++;
3261 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3262 	}
3263 
3264 	if (iter->cur_sk < iter->end_sk)
3265 		sk = iter->batch[iter->cur_sk].sk;
3266 	else
3267 		sk = bpf_iter_tcp_batch(seq);
3268 
3269 	++*pos;
3270 	/* Keeping st->last_pos consistent in tcp_iter_state.
3271 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3272 	 */
3273 	st->last_pos = *pos;
3274 	return sk;
3275 }
3276 
3277 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3278 {
3279 	struct bpf_iter_meta meta;
3280 	struct bpf_prog *prog;
3281 	struct sock *sk = v;
3282 	uid_t uid;
3283 	int ret;
3284 
3285 	if (v == SEQ_START_TOKEN)
3286 		return 0;
3287 
3288 	if (sk_fullsock(sk))
3289 		lock_sock(sk);
3290 
3291 	if (unlikely(sk_unhashed(sk))) {
3292 		ret = SEQ_SKIP;
3293 		goto unlock;
3294 	}
3295 
3296 	if (sk->sk_state == TCP_TIME_WAIT) {
3297 		uid = 0;
3298 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3299 		const struct request_sock *req = v;
3300 
3301 		uid = from_kuid_munged(seq_user_ns(seq),
3302 				       sk_uid(req->rsk_listener));
3303 	} else {
3304 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3305 	}
3306 
3307 	meta.seq = seq;
3308 	prog = bpf_iter_get_info(&meta, false);
3309 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3310 
3311 unlock:
3312 	if (sk_fullsock(sk))
3313 		release_sock(sk);
3314 	return ret;
3315 
3316 }
3317 
3318 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3319 {
3320 	struct bpf_tcp_iter_state *iter = seq->private;
3321 	struct bpf_iter_meta meta;
3322 	struct bpf_prog *prog;
3323 
3324 	if (!v) {
3325 		meta.seq = seq;
3326 		prog = bpf_iter_get_info(&meta, true);
3327 		if (prog)
3328 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3329 	}
3330 
3331 	if (iter->cur_sk < iter->end_sk)
3332 		bpf_iter_tcp_put_batch(iter);
3333 }
3334 
3335 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3336 	.show		= bpf_iter_tcp_seq_show,
3337 	.start		= bpf_iter_tcp_seq_start,
3338 	.next		= bpf_iter_tcp_seq_next,
3339 	.stop		= bpf_iter_tcp_seq_stop,
3340 };
3341 #endif
3342 static unsigned short seq_file_family(const struct seq_file *seq)
3343 {
3344 	const struct tcp_seq_afinfo *afinfo;
3345 
3346 #ifdef CONFIG_BPF_SYSCALL
3347 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3348 	if (seq->op == &bpf_iter_tcp_seq_ops)
3349 		return AF_UNSPEC;
3350 #endif
3351 
3352 	/* Iterated from proc fs */
3353 	afinfo = pde_data(file_inode(seq->file));
3354 	return afinfo->family;
3355 }
3356 
3357 static const struct seq_operations tcp4_seq_ops = {
3358 	.show		= tcp4_seq_show,
3359 	.start		= tcp_seq_start,
3360 	.next		= tcp_seq_next,
3361 	.stop		= tcp_seq_stop,
3362 };
3363 
3364 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3365 	.family		= AF_INET,
3366 };
3367 
3368 static int __net_init tcp4_proc_init_net(struct net *net)
3369 {
3370 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3371 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3372 		return -ENOMEM;
3373 	return 0;
3374 }
3375 
3376 static void __net_exit tcp4_proc_exit_net(struct net *net)
3377 {
3378 	remove_proc_entry("tcp", net->proc_net);
3379 }
3380 
3381 static struct pernet_operations tcp4_net_ops = {
3382 	.init = tcp4_proc_init_net,
3383 	.exit = tcp4_proc_exit_net,
3384 };
3385 
3386 int __init tcp4_proc_init(void)
3387 {
3388 	return register_pernet_subsys(&tcp4_net_ops);
3389 }
3390 
3391 void tcp4_proc_exit(void)
3392 {
3393 	unregister_pernet_subsys(&tcp4_net_ops);
3394 }
3395 #endif /* CONFIG_PROC_FS */
3396 
3397 struct proto tcp_prot = {
3398 	.name			= "TCP",
3399 	.owner			= THIS_MODULE,
3400 	.close			= tcp_close,
3401 	.pre_connect		= tcp_v4_pre_connect,
3402 	.connect		= tcp_v4_connect,
3403 	.disconnect		= tcp_disconnect,
3404 	.accept			= inet_csk_accept,
3405 	.ioctl			= tcp_ioctl,
3406 	.init			= tcp_v4_init_sock,
3407 	.destroy		= tcp_v4_destroy_sock,
3408 	.shutdown		= tcp_shutdown,
3409 	.setsockopt		= tcp_setsockopt,
3410 	.getsockopt		= tcp_getsockopt,
3411 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3412 	.keepalive		= tcp_set_keepalive,
3413 	.recvmsg		= tcp_recvmsg,
3414 	.sendmsg		= tcp_sendmsg,
3415 	.splice_eof		= tcp_splice_eof,
3416 	.backlog_rcv		= tcp_v4_do_rcv,
3417 	.release_cb		= tcp_release_cb,
3418 	.hash			= inet_hash,
3419 	.unhash			= inet_unhash,
3420 	.get_port		= inet_csk_get_port,
3421 	.put_port		= inet_put_port,
3422 #ifdef CONFIG_BPF_SYSCALL
3423 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3424 #endif
3425 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3426 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3427 	.stream_memory_free	= tcp_stream_memory_free,
3428 	.sockets_allocated	= &tcp_sockets_allocated,
3429 
3430 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3431 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3432 
3433 	.memory_pressure	= &tcp_memory_pressure,
3434 	.sysctl_mem		= sysctl_tcp_mem,
3435 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3436 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3437 	.max_header		= MAX_TCP_HEADER,
3438 	.obj_size		= sizeof(struct tcp_sock),
3439 	.freeptr_offset		= offsetof(struct tcp_sock,
3440 					   inet_conn.icsk_inet.sk.sk_freeptr),
3441 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3442 	.twsk_prot		= &tcp_timewait_sock_ops,
3443 	.rsk_prot		= &tcp_request_sock_ops,
3444 	.h.hashinfo		= NULL,
3445 	.no_autobind		= true,
3446 	.diag_destroy		= tcp_abort,
3447 };
3448 EXPORT_SYMBOL(tcp_prot);
3449 
3450 static void __net_exit tcp_sk_exit(struct net *net)
3451 {
3452 	if (net->ipv4.tcp_congestion_control)
3453 		bpf_module_put(net->ipv4.tcp_congestion_control,
3454 			       net->ipv4.tcp_congestion_control->owner);
3455 }
3456 
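/* Give a child netns its own ehash when the parent netns has set
 * sysctl_tcp_child_ehash_entries; otherwise fall back to the global
 * tcp_hashinfo.
 */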
3457 static void __net_init tcp_set_hashinfo(struct net *net)
3458 {
3459 	struct inet_hashinfo *hinfo;
3460 	unsigned int ehash_entries;
3461 	struct net *old_net;
3462 
3463 	if (net_eq(net, &init_net))
3464 		goto fallback;
3465 
3466 	old_net = current->nsproxy->net_ns;
3467 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3468 	if (!ehash_entries)
3469 		goto fallback;
3470 
3471 	ehash_entries = roundup_pow_of_two(ehash_entries);
3472 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3473 	if (!hinfo) {
3474 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3475 			"for a netns, fallback to the global one\n",
3476 			ehash_entries);
3477 fallback:
3478 		hinfo = &tcp_hashinfo;
3479 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3480 	}
3481 
3482 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3483 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3484 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3485 }
3486 
3487 static int __net_init tcp_sk_init(struct net *net)
3488 {
3489 	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3490 	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3491 	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3492 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3493 
3494 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3495 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3496 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3497 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3498 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3499 
3500 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3501 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3502 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3503 
3504 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3505 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3506 	net->ipv4.sysctl_tcp_syncookies = 1;
3507 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3508 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3509 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3510 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3511 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3512 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3513 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3514 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3515 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3516 
3517 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3518 	tcp_set_hashinfo(net);
3519 
3520 	net->ipv4.sysctl_tcp_sack = 1;
3521 	net->ipv4.sysctl_tcp_window_scaling = 1;
3522 	net->ipv4.sysctl_tcp_timestamps = 1;
3523 	net->ipv4.sysctl_tcp_early_retrans = 3;
3524 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3525 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3526 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3527 	net->ipv4.sysctl_tcp_max_reordering = 300;
3528 	net->ipv4.sysctl_tcp_dsack = 1;
3529 	net->ipv4.sysctl_tcp_app_win = 31;
3530 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3531 	net->ipv4.sysctl_tcp_frto = 2;
3532 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3533 	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
3534 	/* This limits the percentage of the congestion window which we
3535 	 * will allow a single TSO frame to consume.  Building TSO frames
3536 	 * which are too large can cause TCP streams to be bursty.
3537 	 */
3538 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3539 	/* Default TSQ limit of 4 MB */
3540 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3541 
3542 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3543 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3544 
3545 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3546 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3547 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3548 	net->ipv4.sysctl_tcp_autocorking = 1;
3549 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3550 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3551 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3552 	if (net != &init_net) {
3553 		memcpy(net->ipv4.sysctl_tcp_rmem,
3554 		       init_net.ipv4.sysctl_tcp_rmem,
3555 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3556 		memcpy(net->ipv4.sysctl_tcp_wmem,
3557 		       init_net.ipv4.sysctl_tcp_wmem,
3558 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3559 	}
3560 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3561 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
3562 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3563 	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
3564 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3565 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3566 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3567 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3568 
3569 	/* Set default values for PLB */
3570 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3571 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3572 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3573 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3574 	/* Default congestion threshold for PLB to mark a round is 50% */
3575 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
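	/* (1 << TCP_PLB_SCALE) represents 100%, so half of it encodes the 50%
	 * threshold in PLB's fixed-point scale.
	 */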
3576 
3577 	/* Reno is always built in */
3578 	if (!net_eq(net, &init_net) &&
3579 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3580 			       init_net.ipv4.tcp_congestion_control->owner))
3581 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3582 	else
3583 		net->ipv4.tcp_congestion_control = &tcp_reno;
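	/* Child netns thus inherit init_net's congestion control whenever a
	 * module reference can be taken; built-in reno needs no reference.
	 */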
3584 
3585 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3586 	net->ipv4.sysctl_tcp_shrink_window = 0;
3587 
3588 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
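	/* Default RTO bounds: TCP_RTO_MIN (nominally 200 ms) and
	 * TCP_RTO_MAX_SEC (120 s), stored in usec and msec respectively.
	 */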
3589 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3590 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3591 
3592 	return 0;
3593 }
3594 
3595 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3596 {
3597 	struct net *net;
3598 
3599 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3600 	 * and from the failed setup_net() error unwinding path are serialized.
3601 	 *
3602 	 * Because tcp_twsk_purge() handles twsks in any dead netns, not just those
3603 	 * on net_exit_list, the thread that dismantles a particular twsk must do
3604 	 * so without another thread progressing to refcount_dec_and_test() of
3605 	 * tcp_death_row.tw_refcount.
3606 	 */
3607 	mutex_lock(&tcp_exit_batch_mutex);
3608 
3609 	tcp_twsk_purge(net_exit_list);
3610 
3611 	list_for_each_entry(net, net_exit_list, exit_list) {
3612 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3613 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3614 		tcp_fastopen_ctx_destroy(net);
3615 	}
3616 
3617 	mutex_unlock(&tcp_exit_batch_mutex);
3618 }
3619 
3620 static struct pernet_operations __net_initdata tcp_sk_ops = {
3621 	.init		= tcp_sk_init,
3622 	.exit		= tcp_sk_exit,
3623 	.exit_batch	= tcp_sk_exit_batch,
3624 };
3625 
3626 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3627 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3628 		     struct sock_common *sk_common, uid_t uid)
3629 
3630 #define INIT_BATCH_SZ 16
3631 
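/* Per-reader setup: initialize the netns-aware seq state and pre-allocate a
 * batch able to hold INIT_BATCH_SZ sockets; bpf_iter_tcp_realloc_batch() is
 * called again during iteration if a bucket holds more sockets than that.
 */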
3632 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3633 {
3634 	struct bpf_tcp_iter_state *iter = priv_data;
3635 	int err;
3636 
3637 	err = bpf_iter_init_seq_net(priv_data, aux);
3638 	if (err)
3639 		return err;
3640 
3641 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3642 	if (err) {
3643 		bpf_iter_fini_seq_net(priv_data);
3644 		return err;
3645 	}
3646 
3647 	return 0;
3648 }
3649 
3650 static void bpf_iter_fini_tcp(void *priv_data)
3651 {
3652 	struct bpf_tcp_iter_state *iter = priv_data;
3653 
3654 	bpf_iter_fini_seq_net(priv_data);
3655 	kvfree(iter->batch);
3656 }
3657 
3658 static const struct bpf_iter_seq_info tcp_seq_info = {
3659 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3660 	.init_seq_private	= bpf_iter_init_tcp,
3661 	.fini_seq_private	= bpf_iter_fini_tcp,
3662 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3663 };
3664 
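/* TCP iterator programs may additionally call bpf_setsockopt() and
 * bpf_getsockopt() on the sockets they visit.
 */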
3665 static const struct bpf_func_proto *
3666 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3667 			    const struct bpf_prog *prog)
3668 {
3669 	switch (func_id) {
3670 	case BPF_FUNC_setsockopt:
3671 		return &bpf_sk_setsockopt_proto;
3672 	case BPF_FUNC_getsockopt:
3673 		return &bpf_sk_getsockopt_proto;
3674 	default:
3675 		return NULL;
3676 	}
3677 }
3678 
3679 static struct bpf_iter_reg tcp_reg_info = {
3680 	.target			= "tcp",
3681 	.ctx_arg_info_size	= 1,
3682 	.ctx_arg_info		= {
3683 		{ offsetof(struct bpf_iter__tcp, sk_common),
3684 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3685 	},
3686 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3687 	.seq_info		= &tcp_seq_info,
3688 };
3689 
3690 static void __init bpf_iter_register(void)
3691 {
3692 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3693 	if (bpf_iter_reg_target(&tcp_reg_info))
3694 		pr_warn("Warning: could not register bpf iterator tcp\n");
3695 }
3696 
3697 #endif
3698 
3699 void __init tcp_v4_init(void)
3700 {
3701 	int cpu, res;
3702 
3703 	for_each_possible_cpu(cpu) {
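	/* One kernel control socket per possible CPU; these are used to send
	 * RST and ACK packets (e.g. from tcp_v4_send_reset()) when no full
	 * socket is available.
	 */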
3704 		struct sock *sk;
3705 
3706 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3707 					   IPPROTO_TCP, &init_net);
3708 		if (res)
3709 			panic("Failed to create the TCP control socket.\n");
3710 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3711 
3712 		/* Enforce IP_DF and IPID == 0 for RST and ACK packets
3713 		 * sent in SYN-RECV and TIME-WAIT state.
3714 		 */
3715 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3716 
3717 		sk->sk_clockid = CLOCK_MONOTONIC;
3718 
3719 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3720 	}
3721 	if (register_pernet_subsys(&tcp_sk_ops))
3722 		panic("Failed to register the TCP pernet operations.\n");
3723 
3724 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3725 	bpf_iter_register();
3726 #endif
3727 }
3728