1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 
92 #include <trace/events/tcp.h>
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
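/* Global TCP socket hash tables (established, listening and bind buckets),
 * shared by the IPv4 and IPv6 TCP code.
 */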
99 struct inet_hashinfo tcp_hashinfo;
100 
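/* Per-CPU control socket used by tcp_v4_send_reset() and tcp_v4_send_ack()
 * to transmit replies outside of full socket context; the bh_lock serializes
 * users on PREEMPT_RT, where BH-disabled sections are preemptible.
 */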
101 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
102 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
103 };
104 
105 static DEFINE_MUTEX(tcp_exit_batch_mutex);
106 
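/* The two helpers below mix the connection 4-tuple with a per-boot secret
 * (secure_tcp_seq()/secure_tcp_ts_off()), in the spirit of RFC 6528, so that
 * off-path attackers cannot predict the ISN or timestamp offset of a new
 * connection.
 */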
107 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108 {
109 	return secure_tcp_seq(ip_hdr(skb)->daddr,
110 			      ip_hdr(skb)->saddr,
111 			      tcp_hdr(skb)->dest,
112 			      tcp_hdr(skb)->source);
113 }
114 
115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116 {
117 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
118 }
119 
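/* Decide whether connect() may safely reuse a port pair whose previous
 * incarnation is still in TIME-WAIT.  Policy comes from
 * net.ipv4.tcp_tw_reuse (0: never, 1: reuse when timestamps make it safe,
 * 2: loopback connections only), rate limited by
 * net.ipv4.tcp_tw_reuse_delay (milliseconds).
 */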
120 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
121 {
122 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
123 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
124 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125 	struct tcp_sock *tp = tcp_sk(sk);
126 	int ts_recent_stamp;
127 	u32 reuse_thresh;
128 
129 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
130 		reuse = 0;
131 
132 	if (reuse == 2) {
133 		/* Still does not detect *everything* that goes through
134 		 * lo, since we require a loopback src or dst address
135 		 * or direct binding to 'lo' interface.
136 		 */
137 		bool loopback = false;
138 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
139 			loopback = true;
140 #if IS_ENABLED(CONFIG_IPV6)
141 		if (tw->tw_family == AF_INET6) {
142 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
144 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
145 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
146 				loopback = true;
147 		} else
148 #endif
149 		{
150 			if (ipv4_is_loopback(tw->tw_daddr) ||
151 			    ipv4_is_loopback(tw->tw_rcv_saddr))
152 				loopback = true;
153 		}
154 		if (!loopback)
155 			reuse = 0;
156 	}
157 
158 	/* With PAWS, it is safe from the viewpoint
159 	   of data integrity. Even without PAWS it is safe provided sequence
160 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
161 
162 	   Actually, the idea is close to VJ's one, only timestamp cache is
163 	   held not per host, but per port pair and TW bucket is used as state
164 	   holder.
165 
166 	   If TW bucket has been already destroyed we fall back to VJ's scheme
167 	   and use initial timestamp retrieved from peer table.
168 	 */
169 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
170 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
171 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
172 	if (ts_recent_stamp &&
173 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
174 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
175 		 * and releasing the bucket lock.
176 		 */
177 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
178 			return 0;
179 
180 		/* In case of repair and re-using TIME-WAIT sockets we still
181 		 * want to be sure that it is safe as above but honor the
182 		 * sequence numbers and time stamps set as part of the repair
183 		 * process.
184 		 *
185 		 * Without this check re-using a TIME-WAIT socket with TCP
186 		 * repair would accumulate a -1 on the repair assigned
187 		 * sequence number. The first time it is reused the sequence
188 		 * is -1, the second time -2, etc. This fixes that issue
189 		 * without appearing to create any others.
190 		 */
191 		if (likely(!tp->repair)) {
192 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
193 
194 			if (!seq)
195 				seq = 1;
196 			WRITE_ONCE(tp->write_seq, seq);
197 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
198 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
199 		}
200 
201 		return 1;
202 	}
203 
204 	return 0;
205 }
206 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
207 
208 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
209 			      int addr_len)
210 {
211 	/* This check is replicated from tcp_v4_connect() and intended to
212 	 * prevent BPF program called below from accessing bytes that are out
213 	 * of the bound specified by user in addr_len.
214 	 */
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	sock_owned_by_me(sk);
219 
220 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
221 }
222 
223 /* This will initiate an outgoing connection. */
224 int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
225 {
226 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
227 	struct inet_timewait_death_row *tcp_death_row;
228 	struct inet_sock *inet = inet_sk(sk);
229 	struct tcp_sock *tp = tcp_sk(sk);
230 	struct ip_options_rcu *inet_opt;
231 	struct net *net = sock_net(sk);
232 	__be16 orig_sport, orig_dport;
233 	__be32 daddr, nexthop;
234 	struct flowi4 *fl4;
235 	struct rtable *rt;
236 	int err;
237 
238 	if (addr_len < sizeof(struct sockaddr_in))
239 		return -EINVAL;
240 
241 	if (usin->sin_family != AF_INET)
242 		return -EAFNOSUPPORT;
243 
244 	nexthop = daddr = usin->sin_addr.s_addr;
245 	inet_opt = rcu_dereference_protected(inet->inet_opt,
246 					     lockdep_sock_is_held(sk));
247 	if (inet_opt && inet_opt->opt.srr) {
248 		if (!daddr)
249 			return -EINVAL;
250 		nexthop = inet_opt->opt.faddr;
251 	}
252 
253 	orig_sport = inet->inet_sport;
254 	orig_dport = usin->sin_port;
255 	fl4 = &inet->cork.fl.u.ip4;
256 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
257 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
258 			      orig_dport, sk);
259 	if (IS_ERR(rt)) {
260 		err = PTR_ERR(rt);
261 		if (err == -ENETUNREACH)
262 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
263 		return err;
264 	}
265 
266 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
267 		ip_rt_put(rt);
268 		return -ENETUNREACH;
269 	}
270 
271 	if (!inet_opt || !inet_opt->opt.srr)
272 		daddr = fl4->daddr;
273 
274 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
275 
276 	if (!inet->inet_saddr) {
277 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
278 		if (err) {
279 			ip_rt_put(rt);
280 			return err;
281 		}
282 	} else {
283 		sk_rcv_saddr_set(sk, inet->inet_saddr);
284 	}
285 
286 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
287 		/* Reset inherited state */
288 		tp->rx_opt.ts_recent	   = 0;
289 		tp->rx_opt.ts_recent_stamp = 0;
290 		if (likely(!tp->repair))
291 			WRITE_ONCE(tp->write_seq, 0);
292 	}
293 
294 	inet->inet_dport = usin->sin_port;
295 	sk_daddr_set(sk, daddr);
296 
297 	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
298 	if (inet_opt)
299 		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
300 
301 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
302 
303 	/* Socket identity is still unknown (sport may be zero).
304 	 * However we set state to SYN-SENT and, without releasing the socket
305 	 * lock, select a source port, enter ourselves into the hash tables and
306 	 * complete initialization after this.
307 	 */
308 	tcp_set_state(sk, TCP_SYN_SENT);
309 	err = inet_hash_connect(tcp_death_row, sk);
310 	if (err)
311 		goto failure;
312 
313 	sk_set_txhash(sk);
314 
315 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
316 			       inet->inet_sport, inet->inet_dport, sk);
317 	if (IS_ERR(rt)) {
318 		err = PTR_ERR(rt);
319 		rt = NULL;
320 		goto failure;
321 	}
322 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
323 	/* OK, now commit destination to socket.  */
324 	sk->sk_gso_type = SKB_GSO_TCPV4;
325 	sk_setup_caps(sk, &rt->dst);
326 	rt = NULL;
327 
328 	if (likely(!tp->repair)) {
329 		if (!tp->write_seq)
330 			WRITE_ONCE(tp->write_seq,
331 				   secure_tcp_seq(inet->inet_saddr,
332 						  inet->inet_daddr,
333 						  inet->inet_sport,
334 						  usin->sin_port));
335 		WRITE_ONCE(tp->tsoffset,
336 			   secure_tcp_ts_off(net, inet->inet_saddr,
337 					     inet->inet_daddr));
338 	}
339 
340 	atomic_set(&inet->inet_id, get_random_u16());
341 
342 	if (tcp_fastopen_defer_connect(sk, &err))
343 		return err;
344 	if (err)
345 		goto failure;
346 
347 	err = tcp_connect(sk);
348 
349 	if (err)
350 		goto failure;
351 
352 	return 0;
353 
354 failure:
355 	/*
356 	 * This unhashes the socket and releases the local port,
357 	 * if necessary.
358 	 */
359 	tcp_set_state(sk, TCP_CLOSE);
360 	inet_bhash2_reset_saddr(sk);
361 	ip_rt_put(rt);
362 	sk->sk_route_caps = 0;
363 	inet->inet_dport = 0;
364 	return err;
365 }
366 EXPORT_IPV6_MOD(tcp_v4_connect);
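
/* Illustrative userspace sketch (not part of the kernel): a blocking
 * connect(2) on an AF_INET stream socket reaches tcp_v4_connect() via
 * __sys_connect() and inet_stream_connect().  The destination address and
 * port below are examples only.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */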
367 
368 /*
369  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
370  * It can be called through tcp_release_cb() if socket was owned by user
371  * at the time tcp_v4_err() was called to handle ICMP message.
372  */
373 void tcp_v4_mtu_reduced(struct sock *sk)
374 {
375 	struct inet_sock *inet = inet_sk(sk);
376 	struct dst_entry *dst;
377 	u32 mtu, dmtu;
378 
379 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
380 		return;
381 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
382 	dst = inet_csk_update_pmtu(sk, mtu);
383 	if (!dst)
384 		return;
385 
386 	/* Something is about to go wrong... Remember the soft error
387 	 * in case this connection is not able to recover.
388 	 */
389 	dmtu = dst4_mtu(dst);
390 	if (mtu < dmtu && ip_dont_fragment(sk, dst))
391 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
392 
393 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
394 	    ip_sk_accept_pmtu(sk) &&
395 	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
396 		tcp_sync_mss(sk, dmtu);
397 
398 		/* Resend the TCP packet because it's
399 		 * clear that the old packet has been
400 		 * dropped. This is the new "fast" path mtu
401 		 * discovery.
402 		 */
403 		tcp_simple_retransmit(sk);
404 	} /* else let the usual retransmit timer handle it */
405 }
406 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
407 
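/* Propagate an ICMP redirect to the cached route, if the socket still has
 * a valid dst.
 */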
408 static void do_redirect(struct sk_buff *skb, struct sock *sk)
409 {
410 	struct dst_entry *dst = __sk_dst_check(sk, 0);
411 
412 	if (dst)
413 		dst->ops->redirect(dst, sk, skb);
414 }
415 
416 
417 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
418 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
419 {
420 	struct request_sock *req = inet_reqsk(sk);
421 	struct net *net = sock_net(sk);
422 
423 	/* ICMPs are not backlogged, hence we cannot get
424 	 * an established socket here.
425 	 */
426 	if (seq != tcp_rsk(req)->snt_isn) {
427 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
428 	} else if (abort) {
429 		/*
430 		 * Still in SYN_RECV, just remove it silently.
431 		 * There is no good way to pass the error to the newly
432 		 * created socket, and POSIX does not want network
433 		 * errors returned from accept().
434 		 */
435 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
436 		tcp_listendrop(req->rsk_listener);
437 	}
438 	reqsk_put(req);
439 }
440 EXPORT_IPV6_MOD(tcp_req_err);
441 
442 /* TCP-LD (RFC 6069) logic */
443 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
444 {
445 	struct inet_connection_sock *icsk = inet_csk(sk);
446 	struct tcp_sock *tp = tcp_sk(sk);
447 	struct sk_buff *skb;
448 	s32 remaining;
449 	u32 delta_us;
450 
451 	if (sock_owned_by_user(sk))
452 		return;
453 
454 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
455 	    !icsk->icsk_backoff)
456 		return;
457 
458 	skb = tcp_rtx_queue_head(sk);
459 	if (WARN_ON_ONCE(!skb))
460 		return;
461 
462 	icsk->icsk_backoff--;
463 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
464 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
465 
466 	tcp_mstamp_refresh(tp);
467 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
468 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
469 
470 	if (remaining > 0) {
471 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
472 	} else {
473 		/* RTO revert clocked out retransmission.
474 		 * Will retransmit now.
475 		 */
476 		tcp_retransmit_timer(sk);
477 	}
478 }
479 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
480 
481 /*
482  * This routine is called by the ICMP module when it gets some
483  * sort of error condition.  If err < 0 then the socket should
484  * be closed and the error returned to the user.  If err > 0
485  * it's just the icmp type << 8 | icmp code.  After adjustment
486  * header points to the first 8 bytes of the tcp header.  We need
487  * to find the appropriate port.
488  *
489  * The locking strategy used here is very "optimistic". When
490  * someone else accesses the socket the ICMP is just dropped
491  * and for some paths there is no check at all.
492  * A more general error queue to queue errors for later handling
493  * is probably better.
494  *
495  */
496 
497 int tcp_v4_err(struct sk_buff *skb, u32 info)
498 {
499 	const struct iphdr *iph = (const struct iphdr *)skb->data;
500 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
501 	struct net *net = dev_net_rcu(skb->dev);
502 	const int type = icmp_hdr(skb)->type;
503 	const int code = icmp_hdr(skb)->code;
504 	struct request_sock *fastopen;
505 	struct tcp_sock *tp;
506 	u32 seq, snd_una;
507 	struct sock *sk;
508 	int err;
509 
510 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
511 				       ntohs(th->source), inet_iif(skb), 0);
512 	if (!sk) {
513 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
514 		return -ENOENT;
515 	}
516 	if (sk->sk_state == TCP_TIME_WAIT) {
517 		/* To increase the counter of ignored icmps for TCP-AO */
518 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
519 		inet_twsk_put(inet_twsk(sk));
520 		return 0;
521 	}
522 	seq = ntohl(th->seq);
523 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
524 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
525 				     type == ICMP_TIME_EXCEEDED ||
526 				     (type == ICMP_DEST_UNREACH &&
527 				      (code == ICMP_NET_UNREACH ||
528 				       code == ICMP_HOST_UNREACH)));
529 		return 0;
530 	}
531 
532 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
533 		sock_put(sk);
534 		return 0;
535 	}
536 
537 	bh_lock_sock(sk);
538 	/* If too many ICMPs get dropped on busy
539 	 * servers this needs to be solved differently.
540 	 * We do take care of PMTU discovery (RFC1191) special case :
541 	 * we can receive locally generated ICMP messages while socket is held.
542 	 */
543 	if (sock_owned_by_user(sk)) {
544 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
545 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
546 	}
547 	if (sk->sk_state == TCP_CLOSE)
548 		goto out;
549 
550 	if (static_branch_unlikely(&ip4_min_ttl)) {
551 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
552 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
553 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
554 			goto out;
555 		}
556 	}
557 
558 	tp = tcp_sk(sk);
559 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
560 	fastopen = rcu_dereference(tp->fastopen_rsk);
561 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
562 	if (sk->sk_state != TCP_LISTEN &&
563 	    !between(seq, snd_una, tp->snd_nxt)) {
564 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
565 		goto out;
566 	}
567 
568 	switch (type) {
569 	case ICMP_REDIRECT:
570 		if (!sock_owned_by_user(sk))
571 			do_redirect(skb, sk);
572 		goto out;
573 	case ICMP_SOURCE_QUENCH:
574 		/* Just silently ignore these. */
575 		goto out;
576 	case ICMP_PARAMETERPROB:
577 		err = EPROTO;
578 		break;
579 	case ICMP_DEST_UNREACH:
580 		if (code > NR_ICMP_UNREACH)
581 			goto out;
582 
583 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
584 			/* We are not interested in TCP_LISTEN and open_requests
585 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
586 			 * they should go through unfragmented).
587 			 */
588 			if (sk->sk_state == TCP_LISTEN)
589 				goto out;
590 
591 			WRITE_ONCE(tp->mtu_info, info);
592 			if (!sock_owned_by_user(sk)) {
593 				tcp_v4_mtu_reduced(sk);
594 			} else {
595 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
596 					sock_hold(sk);
597 			}
598 			goto out;
599 		}
600 
601 		err = icmp_err_convert[code].errno;
602 		/* check if this ICMP message allows revert of backoff.
603 		 * (see RFC 6069)
604 		 */
605 		if (!fastopen &&
606 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
607 			tcp_ld_RTO_revert(sk, seq);
608 		break;
609 	case ICMP_TIME_EXCEEDED:
610 		err = EHOSTUNREACH;
611 		break;
612 	default:
613 		goto out;
614 	}
615 
616 	switch (sk->sk_state) {
617 	case TCP_SYN_SENT:
618 	case TCP_SYN_RECV:
619 		/* Only in fast or simultaneous open. If a fast open socket is
620 		 * already accepted it is treated as a connected one below.
621 		 */
622 		if (fastopen && !fastopen->sk)
623 			break;
624 
625 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
626 
627 		if (!sock_owned_by_user(sk))
628 			tcp_done_with_error(sk, err);
629 		else
630 			WRITE_ONCE(sk->sk_err_soft, err);
631 		goto out;
632 	}
633 
634 	/* If we've already connected we will keep trying
635 	 * until we time out, or the user gives up.
636 	 *
637 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
638 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
639 	 * but it is obsoleted by pmtu discovery).
640 	 *
641 	 * Note that in the modern internet, where routing is unreliable
642 	 * and broken firewalls sitting in every dark corner send random
643 	 * errors ordered by their masters, even these two messages finally lose
644 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
645 	 *
646 	 * Now we are in compliance with RFCs.
647 	 *							--ANK (980905)
648 	 */
649 
650 	if (!sock_owned_by_user(sk) &&
651 	    inet_test_bit(RECVERR, sk)) {
652 		WRITE_ONCE(sk->sk_err, err);
653 		sk_error_report(sk);
654 	} else	{ /* Only an error on timeout */
655 		WRITE_ONCE(sk->sk_err_soft, err);
656 	}
657 
658 out:
659 	bh_unlock_sock(sk);
660 	sock_put(sk);
661 	return 0;
662 }
663 
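/* Number of 32-bit words of TCP option space (40 bytes) that a locally
 * generated RST may need for MD5 or AO signing options.
 */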
664 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
665 
666 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
667 				 const struct tcp_ao_hdr *aoh,
668 				 struct ip_reply_arg *arg, struct tcphdr *reply,
669 				 __be32 reply_options[REPLY_OPTIONS_LEN])
670 {
671 #ifdef CONFIG_TCP_AO
672 	int sdif = tcp_v4_sdif(skb);
673 	int dif = inet_iif(skb);
674 	int l3index = sdif ? dif : 0;
675 	bool allocated_traffic_key;
676 	struct tcp_ao_key *key;
677 	char *traffic_key;
678 	bool drop = true;
679 	u32 ao_sne = 0;
680 	u8 keyid;
681 
682 	rcu_read_lock();
683 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
684 				 &key, &traffic_key, &allocated_traffic_key,
685 				 &keyid, &ao_sne))
686 		goto out;
687 
688 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
689 				 (aoh->rnext_keyid << 8) | keyid);
690 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
691 	reply->doff = arg->iov[0].iov_len / 4;
692 
693 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
694 			    key, traffic_key,
695 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
696 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
697 			    reply, ao_sne))
698 		goto out;
699 	drop = false;
700 out:
701 	rcu_read_unlock();
702 	if (allocated_traffic_key)
703 		kfree(traffic_key);
704 	return drop;
705 #else
706 	return true;
707 #endif
708 }
709 
710 /*
711  *	This routine will send an RST to the other TCP endpoint.
712  *
713  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
714  *		      for the reset.
715  *	Answer: if a packet caused the RST, it is not for a socket
716  *		existing in our system; if it is matched to a socket,
717  *		it is just a duplicate segment or a bug in the other side's TCP.
718  *		So we build the reply based only on parameters
719  *		that arrived with the segment.
720  *	Exception: precedence violation. We do not implement it in any case.
721  */
722 
723 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
724 			      enum sk_rst_reason reason)
725 {
726 	const struct tcphdr *th = tcp_hdr(skb);
727 	struct {
728 		struct tcphdr th;
729 		__be32 opt[REPLY_OPTIONS_LEN];
730 	} rep;
731 	const __u8 *md5_hash_location = NULL;
732 	const struct tcp_ao_hdr *aoh;
733 	struct ip_reply_arg arg;
734 #ifdef CONFIG_TCP_MD5SIG
735 	struct tcp_md5sig_key *key = NULL;
736 	unsigned char newhash[16];
737 	struct sock *sk1 = NULL;
738 #endif
739 	u64 transmit_time = 0;
740 	struct sock *ctl_sk;
741 	struct net *net;
742 	u32 txhash = 0;
743 
744 	/* Never send a reset in response to a reset. */
745 	if (th->rst)
746 		return;
747 
748 	/* If sk is not NULL, it means we did a successful lookup and the incoming
749 	 * route had to be correct. prequeue might have dropped our dst.
750 	 */
751 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
752 		return;
753 
754 	/* Swap the send and the receive. */
755 	memset(&rep, 0, sizeof(rep));
756 	rep.th.dest   = th->source;
757 	rep.th.source = th->dest;
758 	rep.th.doff   = sizeof(struct tcphdr) / 4;
759 	rep.th.rst    = 1;
760 
761 	if (th->ack) {
762 		rep.th.seq = th->ack_seq;
763 	} else {
764 		rep.th.ack = 1;
765 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
766 				       skb->len - (th->doff << 2));
767 	}
768 
769 	memset(&arg, 0, sizeof(arg));
770 	arg.iov[0].iov_base = (unsigned char *)&rep;
771 	arg.iov[0].iov_len  = sizeof(rep.th);
772 
773 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
774 
775 	/* Invalid TCP option size or twice included auth */
776 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
777 		return;
778 
779 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
780 		return;
781 
782 #ifdef CONFIG_TCP_MD5SIG
783 	rcu_read_lock();
784 	if (sk && sk_fullsock(sk)) {
785 		const union tcp_md5_addr *addr;
786 		int l3index;
787 
788 		/* sdif set, means packet ingressed via a device
789 		 * in an L3 domain and inet_iif is set to it.
790 		 */
791 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
792 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
793 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
794 	} else if (md5_hash_location) {
795 		const union tcp_md5_addr *addr;
796 		int sdif = tcp_v4_sdif(skb);
797 		int dif = inet_iif(skb);
798 		int l3index;
799 
800 		/*
801 		 * The active side is lost. Try to find the listening socket through
802 		 * the source port, and then find the MD5 key through the listening socket.
803 		 * We do not lose security here:
804 		 * the incoming packet is checked against the MD5 hash of the found key,
805 		 * and no RST is generated if the MD5 hash doesn't match.
806 		 */
807 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
808 					     th->source, ip_hdr(skb)->daddr,
809 					     ntohs(th->source), dif, sdif);
810 		/* don't send rst if it can't find key */
811 		if (!sk1)
812 			goto out;
813 
814 		/* sdif set, means packet ingressed via a device
815 		 * in an L3 domain and dif is set to it.
816 		 */
817 		l3index = sdif ? dif : 0;
818 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
819 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
820 		if (!key)
821 			goto out;
822 
823 		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
824 		if (memcmp(md5_hash_location, newhash, 16) != 0)
825 			goto out;
826 	}
827 
828 	if (key) {
829 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
830 				   (TCPOPT_NOP << 16) |
831 				   (TCPOPT_MD5SIG << 8) |
832 				   TCPOLEN_MD5SIG);
833 		/* Update length and the length the header thinks exists */
834 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
835 		rep.th.doff = arg.iov[0].iov_len / 4;
836 
837 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
838 				     key, ip_hdr(skb)->saddr,
839 				     ip_hdr(skb)->daddr, &rep.th);
840 	}
841 #endif
842 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
843 	if (rep.opt[0] == 0) {
844 		__be32 mrst = mptcp_reset_option(skb);
845 
846 		if (mrst) {
847 			rep.opt[0] = mrst;
848 			arg.iov[0].iov_len += sizeof(mrst);
849 			rep.th.doff = arg.iov[0].iov_len / 4;
850 		}
851 	}
852 
853 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
854 				      ip_hdr(skb)->saddr, /* XXX */
855 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
856 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
857 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
858 
859 	/* When the socket is gone, all binding information is lost and
860 	 * routing might fail in this case. No choice here: if we choose to force
861 	 * the input interface, we will misroute in case of an asymmetric route.
862 	 */
863 	if (sk)
864 		arg.bound_dev_if = sk->sk_bound_dev_if;
865 
866 	trace_tcp_send_reset(sk, skb, reason);
867 
868 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
869 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
870 
871 	/* ECN bits of TW reset are cleared */
872 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
873 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
874 	local_bh_disable();
875 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
876 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
877 
878 	sock_net_set(ctl_sk, net);
879 	if (sk) {
880 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
881 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
882 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
883 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
884 		transmit_time = tcp_transmit_time(sk);
885 		xfrm_sk_clone_policy(ctl_sk, sk);
886 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
887 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
888 	} else {
889 		ctl_sk->sk_mark = 0;
890 		ctl_sk->sk_priority = 0;
891 	}
892 	ip_send_unicast_reply(ctl_sk, sk,
893 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
894 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
895 			      &arg, arg.iov[0].iov_len,
896 			      transmit_time, txhash);
897 
898 	xfrm_sk_free_policy(ctl_sk);
899 	sock_net_set(ctl_sk, &init_net);
900 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
901 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
902 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
903 	local_bh_enable();
904 
905 #ifdef CONFIG_TCP_MD5SIG
906 out:
907 	rcu_read_unlock();
908 #endif
909 }
910 
911 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
912    outside of socket context, is ugly, certainly. What can I do?
913  */
914 
915 static void tcp_v4_send_ack(const struct sock *sk,
916 			    struct sk_buff *skb, u32 seq, u32 ack,
917 			    u32 win, u32 tsval, u32 tsecr, int oif,
918 			    struct tcp_key *key,
919 			    int reply_flags, u8 tos, u32 txhash)
920 {
921 	const struct tcphdr *th = tcp_hdr(skb);
922 	struct {
923 		struct tcphdr th;
924 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
925 	} rep;
926 	struct net *net = sock_net(sk);
927 	struct ip_reply_arg arg;
928 	struct sock *ctl_sk;
929 	u64 transmit_time;
930 
931 	memset(&rep.th, 0, sizeof(struct tcphdr));
932 	memset(&arg, 0, sizeof(arg));
933 
934 	arg.iov[0].iov_base = (unsigned char *)&rep;
935 	arg.iov[0].iov_len  = sizeof(rep.th);
936 	if (tsecr) {
937 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
938 				   (TCPOPT_TIMESTAMP << 8) |
939 				   TCPOLEN_TIMESTAMP);
940 		rep.opt[1] = htonl(tsval);
941 		rep.opt[2] = htonl(tsecr);
942 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
943 	}
944 
945 	/* Swap the send and the receive. */
946 	rep.th.dest    = th->source;
947 	rep.th.source  = th->dest;
948 	rep.th.doff    = arg.iov[0].iov_len / 4;
949 	rep.th.seq     = htonl(seq);
950 	rep.th.ack_seq = htonl(ack);
951 	rep.th.ack     = 1;
952 	rep.th.window  = htons(win);
953 
954 #ifdef CONFIG_TCP_MD5SIG
955 	if (tcp_key_is_md5(key)) {
956 		int offset = (tsecr) ? 3 : 0;
957 
958 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
959 					  (TCPOPT_NOP << 16) |
960 					  (TCPOPT_MD5SIG << 8) |
961 					  TCPOLEN_MD5SIG);
962 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
963 		rep.th.doff = arg.iov[0].iov_len/4;
964 
965 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
966 				    key->md5_key, ip_hdr(skb)->saddr,
967 				    ip_hdr(skb)->daddr, &rep.th);
968 	}
969 #endif
970 #ifdef CONFIG_TCP_AO
971 	if (tcp_key_is_ao(key)) {
972 		int offset = (tsecr) ? 3 : 0;
973 
974 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
975 					  (tcp_ao_len(key->ao_key) << 16) |
976 					  (key->ao_key->sndid << 8) |
977 					  key->rcv_next);
978 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
979 		rep.th.doff = arg.iov[0].iov_len / 4;
980 
981 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
982 				key->ao_key, key->traffic_key,
983 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
984 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
985 				&rep.th, key->sne);
986 	}
987 #endif
988 	arg.flags = reply_flags;
989 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
990 				      ip_hdr(skb)->saddr, /* XXX */
991 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
992 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
993 	if (oif)
994 		arg.bound_dev_if = oif;
995 	arg.tos = tos;
996 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
997 	local_bh_disable();
998 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
999 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1000 	sock_net_set(ctl_sk, net);
1001 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1002 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1003 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1004 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1005 	transmit_time = tcp_transmit_time(sk);
1006 	ip_send_unicast_reply(ctl_sk, sk,
1007 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1008 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1009 			      &arg, arg.iov[0].iov_len,
1010 			      transmit_time, txhash);
1011 
1012 	sock_net_set(ctl_sk, &init_net);
1013 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1014 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1015 	local_bh_enable();
1016 }
1017 
1018 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1019 				enum tcp_tw_status tw_status)
1020 {
1021 	struct inet_timewait_sock *tw = inet_twsk(sk);
1022 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1023 	struct tcp_key key = {};
1024 	u8 tos = tw->tw_tos;
1025 
1026 	/* Clear only the ECN bits of TW ACKs for out-of-window data or paws_reject,
1027 	 * while not clearing ECN bits of other TW ACKs, to avoid those ACKs
1028 	 * being placed in a different service queue (Classic rather than L4S).
1029 	 */
1030 	if (tw_status == TCP_TW_ACK_OOW)
1031 		tos &= ~INET_ECN_MASK;
1032 
1033 #ifdef CONFIG_TCP_AO
1034 	struct tcp_ao_info *ao_info;
1035 
1036 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1037 		/* FIXME: the segment to-be-acked is not verified yet */
1038 		ao_info = rcu_dereference(tcptw->ao_info);
1039 		if (ao_info) {
1040 			const struct tcp_ao_hdr *aoh;
1041 
1042 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1043 				inet_twsk_put(tw);
1044 				return;
1045 			}
1046 
1047 			if (aoh)
1048 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1049 								    aoh->rnext_keyid, -1);
1050 		}
1051 	}
1052 	if (key.ao_key) {
1053 		struct tcp_ao_key *rnext_key;
1054 
1055 		key.traffic_key = snd_other_key(key.ao_key);
1056 		key.sne = READ_ONCE(ao_info->snd_sne);
1057 		rnext_key = READ_ONCE(ao_info->rnext_key);
1058 		key.rcv_next = rnext_key->rcvid;
1059 		key.type = TCP_KEY_AO;
1060 #else
1061 	if (0) {
1062 #endif
1063 	} else if (static_branch_tcp_md5()) {
1064 		key.md5_key = tcp_twsk_md5_key(tcptw);
1065 		if (key.md5_key)
1066 			key.type = TCP_KEY_MD5;
1067 	}
1068 
1069 	tcp_v4_send_ack(sk, skb,
1070 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1071 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1072 			tcp_tw_tsval(tcptw),
1073 			READ_ONCE(tcptw->tw_ts_recent),
1074 			tw->tw_bound_dev_if, &key,
1075 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1076 			tos,
1077 			tw->tw_txhash);
1078 
1079 	inet_twsk_put(tw);
1080 }
1081 
1082 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1083 				  struct request_sock *req)
1084 {
1085 	struct tcp_key key = {};
1086 
1087 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1088 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1089 	 */
1090 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1091 					     tcp_sk(sk)->snd_nxt;
1092 
1093 #ifdef CONFIG_TCP_AO
1094 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1095 	    tcp_rsk_used_ao(req)) {
1096 		const union tcp_md5_addr *addr;
1097 		const struct tcp_ao_hdr *aoh;
1098 		int l3index;
1099 
1100 		/* Invalid TCP option size or twice included auth */
1101 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1102 			return;
1103 		if (!aoh)
1104 			return;
1105 
1106 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1107 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1108 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1109 					      aoh->rnext_keyid, -1);
1110 		if (unlikely(!key.ao_key)) {
1111 			/* Send ACK with any matching MKT for the peer */
1112 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1113 			/* The matching key disappeared (user removed the key?);
1114 			 * let the handshake time out.
1115 			 */
1116 			if (!key.ao_key) {
1117 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1118 						     addr,
1119 						     ntohs(tcp_hdr(skb)->source),
1120 						     &ip_hdr(skb)->daddr,
1121 						     ntohs(tcp_hdr(skb)->dest));
1122 				return;
1123 			}
1124 		}
1125 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1126 		if (!key.traffic_key)
1127 			return;
1128 
1129 		key.type = TCP_KEY_AO;
1130 		key.rcv_next = aoh->keyid;
1131 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1132 #else
1133 	if (0) {
1134 #endif
1135 	} else if (static_branch_tcp_md5()) {
1136 		const union tcp_md5_addr *addr;
1137 		int l3index;
1138 
1139 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1140 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1141 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142 		if (key.md5_key)
1143 			key.type = TCP_KEY_MD5;
1144 	}
1145 
1146 	/* Clear ECN bits of ACKs of oow data or paws_reject */
1147 	tcp_v4_send_ack(sk, skb, seq,
1148 			tcp_rsk(req)->rcv_nxt,
1149 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1150 			tcp_rsk_tsval(tcp_rsk(req)),
1151 			req->ts_recent,
1152 			0, &key,
1153 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1154 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1155 			READ_ONCE(tcp_rsk(req)->txhash));
1156 	if (tcp_key_is_ao(&key))
1157 		kfree(key.traffic_key);
1158 }
1159 
1160 /*
1161  *	Send a SYN-ACK after having received a SYN.
1162  *	This still operates on a request_sock only, not on a big
1163  *	socket.
1164  */
1165 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1166 			      struct flowi *fl,
1167 			      struct request_sock *req,
1168 			      struct tcp_fastopen_cookie *foc,
1169 			      enum tcp_synack_type synack_type,
1170 			      struct sk_buff *syn_skb)
1171 {
1172 	struct inet_request_sock *ireq = inet_rsk(req);
1173 	struct flowi4 fl4;
1174 	int err = -1;
1175 	struct sk_buff *skb;
1176 	u8 tos;
1177 
1178 	/* First, grab a route. */
1179 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1180 		return -1;
1181 
1182 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1183 
1184 	if (skb) {
1185 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1186 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1187 
1188 		tos = READ_ONCE(inet_sk(sk)->tos);
1189 
1190 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1191 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1192 			      (tos & INET_ECN_MASK);
1193 
1194 		if (!INET_ECN_is_capable(tos) &&
1195 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1196 			tos |= INET_ECN_ECT_0;
1197 
1198 		rcu_read_lock();
1199 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1200 					    ireq->ir_rmt_addr,
1201 					    rcu_dereference(ireq->ireq_opt),
1202 					    tos);
1203 		rcu_read_unlock();
1204 		err = net_xmit_eval(err);
1205 	}
1206 
1207 	return err;
1208 }
1209 
1210 /*
1211  *	IPv4 request_sock destructor.
1212  */
1213 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1214 {
1215 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1216 }
1217 
1218 #ifdef CONFIG_TCP_MD5SIG
1219 /*
1220  * RFC2385 MD5 checksumming requires a mapping of
1221  * IP address->MD5 Key.
1222  * We need to maintain these in the sk structure.
1223  */
1224 
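/* Patched to "true" once the first MD5 key is installed, so unsigned flows
 * never pay for MD5 checks; the HZ argument rate limits flipping the branch
 * back off when the last user goes away.
 */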
1225 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1226 EXPORT_IPV6_MOD(tcp_md5_needed);
1227 
1228 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1229 {
1230 	if (!old)
1231 		return true;
1232 
1233 	/* l3index always overrides non-l3index */
1234 	if (old->l3index && new->l3index == 0)
1235 		return false;
1236 	if (old->l3index == 0 && new->l3index)
1237 		return true;
1238 
1239 	return old->prefixlen < new->prefixlen;
1240 }
1241 
1242 /* Find the Key structure for an address.  */
1243 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1244 					   const union tcp_md5_addr *addr,
1245 					   int family, bool any_l3index)
1246 {
1247 	const struct tcp_sock *tp = tcp_sk(sk);
1248 	struct tcp_md5sig_key *key;
1249 	const struct tcp_md5sig_info *md5sig;
1250 	__be32 mask;
1251 	struct tcp_md5sig_key *best_match = NULL;
1252 	bool match;
1253 
1254 	/* caller either holds rcu_read_lock() or socket lock */
1255 	md5sig = rcu_dereference_check(tp->md5sig_info,
1256 				       lockdep_sock_is_held(sk));
1257 	if (!md5sig)
1258 		return NULL;
1259 
1260 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1261 				 lockdep_sock_is_held(sk)) {
1262 		if (key->family != family)
1263 			continue;
1264 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1265 		    key->l3index != l3index)
1266 			continue;
1267 		if (family == AF_INET) {
1268 			mask = inet_make_mask(key->prefixlen);
1269 			match = (key->addr.a4.s_addr & mask) ==
1270 				(addr->a4.s_addr & mask);
1271 #if IS_ENABLED(CONFIG_IPV6)
1272 		} else if (family == AF_INET6) {
1273 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1274 						  key->prefixlen);
1275 #endif
1276 		} else {
1277 			match = false;
1278 		}
1279 
1280 		if (match && better_md5_match(best_match, key))
1281 			best_match = key;
1282 	}
1283 	return best_match;
1284 }
1285 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1286 
1287 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1288 						      const union tcp_md5_addr *addr,
1289 						      int family, u8 prefixlen,
1290 						      int l3index, u8 flags)
1291 {
1292 	const struct tcp_sock *tp = tcp_sk(sk);
1293 	struct tcp_md5sig_key *key;
1294 	unsigned int size = sizeof(struct in_addr);
1295 	const struct tcp_md5sig_info *md5sig;
1296 
1297 	/* caller either holds rcu_read_lock() or socket lock */
1298 	md5sig = rcu_dereference_check(tp->md5sig_info,
1299 				       lockdep_sock_is_held(sk));
1300 	if (!md5sig)
1301 		return NULL;
1302 #if IS_ENABLED(CONFIG_IPV6)
1303 	if (family == AF_INET6)
1304 		size = sizeof(struct in6_addr);
1305 #endif
1306 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1307 				 lockdep_sock_is_held(sk)) {
1308 		if (key->family != family)
1309 			continue;
1310 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1311 			continue;
1312 		if (key->l3index != l3index)
1313 			continue;
1314 		if (!memcmp(&key->addr, addr, size) &&
1315 		    key->prefixlen == prefixlen)
1316 			return key;
1317 	}
1318 	return NULL;
1319 }
1320 
1321 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1322 					 const struct sock *addr_sk)
1323 {
1324 	const union tcp_md5_addr *addr;
1325 	int l3index;
1326 
1327 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1328 						 addr_sk->sk_bound_dev_if);
1329 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1330 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1331 }
1332 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1333 
1334 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1335 {
1336 	struct tcp_sock *tp = tcp_sk(sk);
1337 	struct tcp_md5sig_info *md5sig;
1338 
1339 	md5sig = kmalloc_obj(*md5sig, gfp);
1340 	if (!md5sig)
1341 		return -ENOMEM;
1342 
1343 	sk_gso_disable(sk);
1344 	INIT_HLIST_HEAD(&md5sig->head);
1345 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1346 	return 0;
1347 }
1348 
1349 /* This can be called on a newly created socket, from other files */
1350 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1351 			    int family, u8 prefixlen, int l3index, u8 flags,
1352 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1353 {
1354 	/* Add Key to the list */
1355 	struct tcp_md5sig_key *key;
1356 	struct tcp_sock *tp = tcp_sk(sk);
1357 	struct tcp_md5sig_info *md5sig;
1358 
1359 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1360 	if (key) {
1361 		/* Pre-existing entry - just update that one.
1362 		 * Note that the key might be used concurrently.
1363 		 * data_race() is telling KCSAN that we do not care about
1364 		 * key mismatches, since changing the MD5 key on live flows
1365 		 * can lead to packet drops.
1366 		 */
1367 		data_race(memcpy(key->key, newkey, newkeylen));
1368 
1369 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1370 		 * Also note that a reader could catch new key->keylen value
1371 		 * but old key->key[], this is the reason we use __GFP_ZERO
1372 		 * at sock_kmalloc() time below these lines.
1373 		 */
1374 		WRITE_ONCE(key->keylen, newkeylen);
1375 
1376 		return 0;
1377 	}
1378 
1379 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1380 					   lockdep_sock_is_held(sk));
1381 
1382 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1383 	if (!key)
1384 		return -ENOMEM;
1385 
1386 	memcpy(key->key, newkey, newkeylen);
1387 	key->keylen = newkeylen;
1388 	key->family = family;
1389 	key->prefixlen = prefixlen;
1390 	key->l3index = l3index;
1391 	key->flags = flags;
1392 	memcpy(&key->addr, addr,
1393 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1394 								 sizeof(struct in_addr));
1395 	hlist_add_head_rcu(&key->node, &md5sig->head);
1396 	return 0;
1397 }
1398 
1399 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1400 		   int family, u8 prefixlen, int l3index, u8 flags,
1401 		   const u8 *newkey, u8 newkeylen)
1402 {
1403 	struct tcp_sock *tp = tcp_sk(sk);
1404 
1405 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1406 		if (fips_enabled) {
1407 			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
1408 			return -EOPNOTSUPP;
1409 		}
1410 
1411 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1412 			return -ENOMEM;
1413 
1414 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1415 			struct tcp_md5sig_info *md5sig;
1416 
1417 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1418 			rcu_assign_pointer(tp->md5sig_info, NULL);
1419 			kfree_rcu(md5sig, rcu);
1420 			return -EUSERS;
1421 		}
1422 	}
1423 
1424 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1425 				newkey, newkeylen, GFP_KERNEL);
1426 }
1427 EXPORT_IPV6_MOD(tcp_md5_do_add);
1428 
1429 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1430 		     int family, u8 prefixlen, int l3index,
1431 		     struct tcp_md5sig_key *key)
1432 {
1433 	struct tcp_sock *tp = tcp_sk(sk);
1434 
1435 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1436 
1437 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1438 			return -ENOMEM;
1439 
1440 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1441 			struct tcp_md5sig_info *md5sig;
1442 
1443 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1444 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1445 			rcu_assign_pointer(tp->md5sig_info, NULL);
1446 			kfree_rcu(md5sig, rcu);
1447 			return -EUSERS;
1448 		}
1449 	}
1450 
1451 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1452 				key->flags, key->key, key->keylen,
1453 				sk_gfp_mask(sk, GFP_ATOMIC));
1454 }
1455 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1456 
1457 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1458 		   u8 prefixlen, int l3index, u8 flags)
1459 {
1460 	struct tcp_md5sig_key *key;
1461 
1462 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1463 	if (!key)
1464 		return -ENOENT;
1465 	hlist_del_rcu(&key->node);
1466 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1467 	kfree_rcu(key, rcu);
1468 	return 0;
1469 }
1470 EXPORT_IPV6_MOD(tcp_md5_do_del);
1471 
1472 void tcp_clear_md5_list(struct sock *sk)
1473 {
1474 	struct tcp_sock *tp = tcp_sk(sk);
1475 	struct tcp_md5sig_key *key;
1476 	struct hlist_node *n;
1477 	struct tcp_md5sig_info *md5sig;
1478 
1479 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1480 
1481 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1482 		hlist_del(&key->node);
1483 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1484 		kfree(key);
1485 	}
1486 }
1487 
1488 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1489 				 sockptr_t optval, int optlen)
1490 {
1491 	struct tcp_md5sig cmd;
1492 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1493 	const union tcp_md5_addr *addr;
1494 	u8 prefixlen = 32;
1495 	int l3index = 0;
1496 	bool l3flag;
1497 	u8 flags;
1498 
1499 	if (optlen < sizeof(cmd))
1500 		return -EINVAL;
1501 
1502 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1503 		return -EFAULT;
1504 
1505 	if (sin->sin_family != AF_INET)
1506 		return -EINVAL;
1507 
1508 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1509 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1510 
1511 	if (optname == TCP_MD5SIG_EXT &&
1512 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1513 		prefixlen = cmd.tcpm_prefixlen;
1514 		if (prefixlen > 32)
1515 			return -EINVAL;
1516 	}
1517 
1518 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1519 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1520 		struct net_device *dev;
1521 
1522 		rcu_read_lock();
1523 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1524 		if (dev && netif_is_l3_master(dev))
1525 			l3index = dev->ifindex;
1526 
1527 		rcu_read_unlock();
1528 
1529 		/* ok to reference set/not set outside of rcu;
1530 		 * right now device MUST be an L3 master
1531 		 */
1532 		if (!dev || !l3index)
1533 			return -EINVAL;
1534 	}
1535 
1536 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1537 
1538 	if (!cmd.tcpm_keylen)
1539 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1540 
1541 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1542 		return -EINVAL;
1543 
1544 	/* Don't allow keys for peers that have a matching TCP-AO key.
1545 	 * See the comment in tcp_ao_add_cmd()
1546 	 */
1547 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1548 		return -EKEYREJECTED;
1549 
1550 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1551 			      cmd.tcpm_key, cmd.tcpm_keylen);
1552 }
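
/* Illustrative userspace sketch (not part of the kernel) of the interface
 * parsed above; the peer address and key value are examples only:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */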
1553 
1554 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1555 				    __be32 daddr, __be32 saddr,
1556 				    const struct tcphdr *th, int nbytes)
1557 {
1558 	struct {
1559 		struct tcp4_pseudohdr ip;
1560 		struct tcphdr tcp;
1561 	} h;
1562 
1563 	h.ip.saddr = saddr;
1564 	h.ip.daddr = daddr;
1565 	h.ip.pad = 0;
1566 	h.ip.protocol = IPPROTO_TCP;
1567 	h.ip.len = cpu_to_be16(nbytes);
1568 	h.tcp = *th;
1569 	h.tcp.check = 0;
1570 	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1571 }
1572 
1573 static noinline_for_stack void
1574 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1575 		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
1576 {
1577 	struct md5_ctx ctx;
1578 
1579 	md5_init(&ctx);
1580 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1581 	tcp_md5_hash_key(&ctx, key);
1582 	md5_final(&ctx, md5_hash);
1583 }
1584 
1585 noinline_for_stack void
1586 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1587 		    const struct sock *sk, const struct sk_buff *skb)
1588 {
1589 	const struct tcphdr *th = tcp_hdr(skb);
1590 	__be32 saddr, daddr;
1591 	struct md5_ctx ctx;
1592 
1593 	if (sk) { /* valid for establish/request sockets */
1594 		saddr = sk->sk_rcv_saddr;
1595 		daddr = sk->sk_daddr;
1596 	} else {
1597 		const struct iphdr *iph = ip_hdr(skb);
1598 		saddr = iph->saddr;
1599 		daddr = iph->daddr;
1600 	}
1601 
1602 	md5_init(&ctx);
1603 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1604 	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1605 	tcp_md5_hash_key(&ctx, key);
1606 	md5_final(&ctx, md5_hash);
1607 }
1608 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1609 
1610 #endif
1611 
1612 static void tcp_v4_init_req(struct request_sock *req,
1613 			    const struct sock *sk_listener,
1614 			    struct sk_buff *skb)
1615 {
1616 	struct inet_request_sock *ireq = inet_rsk(req);
1617 	struct net *net = sock_net(sk_listener);
1618 
1619 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1620 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1621 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1622 }
1623 
1624 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1625 					  struct sk_buff *skb,
1626 					  struct flowi *fl,
1627 					  struct request_sock *req,
1628 					  u32 tw_isn)
1629 {
1630 	tcp_v4_init_req(req, sk, skb);
1631 
1632 	if (security_inet_conn_request(sk, skb, req))
1633 		return NULL;
1634 
1635 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1636 }
1637 
1638 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1639 	.family		=	PF_INET,
1640 	.obj_size	=	sizeof(struct tcp_request_sock),
1641 	.send_ack	=	tcp_v4_reqsk_send_ack,
1642 	.destructor	=	tcp_v4_reqsk_destructor,
1643 	.send_reset	=	tcp_v4_send_reset,
1644 };
1645 
1646 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1647 	.mss_clamp	=	TCP_MSS_DEFAULT,
1648 #ifdef CONFIG_TCP_MD5SIG
1649 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1650 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1651 #endif
1652 #ifdef CONFIG_TCP_AO
1653 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1654 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1655 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1656 #endif
1657 #ifdef CONFIG_SYN_COOKIES
1658 	.cookie_init_seq =	cookie_v4_init_sequence,
1659 #endif
1660 	.route_req	=	tcp_v4_route_req,
1661 	.init_seq	=	tcp_v4_init_seq,
1662 	.init_ts_off	=	tcp_v4_init_ts_off,
1663 	.send_synack	=	tcp_v4_send_synack,
1664 };
1665 
1666 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1667 {
1668 	/* Never answer SYNs sent to broadcast or multicast */
1669 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1670 		goto drop;
1671 
1672 	return tcp_conn_request(&tcp_request_sock_ops,
1673 				&tcp_request_sock_ipv4_ops, sk, skb);
1674 
1675 drop:
1676 	tcp_listendrop(sk);
1677 	return 0;
1678 }
1679 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1680 
1681 
1682 /*
1683  * The three way handshake has completed - we got a valid ACK -
1684  * now create the new socket.
1685  */
1686 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1687 				  struct request_sock *req,
1688 				  struct dst_entry *dst,
1689 				  struct request_sock *req_unhash,
1690 				  bool *own_req,
1691 				  void (*opt_child_init)(struct sock *newsk,
1692 							 const struct sock *sk))
1693 {
1694 	struct inet_request_sock *ireq;
1695 	bool found_dup_sk = false;
1696 	struct inet_sock *newinet;
1697 	struct tcp_sock *newtp;
1698 	struct sock *newsk;
1699 #ifdef CONFIG_TCP_MD5SIG
1700 	const union tcp_md5_addr *addr;
1701 	struct tcp_md5sig_key *key;
1702 	int l3index;
1703 #endif
1704 	struct ip_options_rcu *inet_opt;
1705 
1706 	if (sk_acceptq_is_full(sk))
1707 		goto exit_overflow;
1708 
1709 	newsk = tcp_create_openreq_child(sk, req, skb);
1710 	if (!newsk)
1711 		goto exit_nonewsk;
1712 
1713 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1714 	inet_sk_rx_dst_set(newsk, skb);
1715 
1716 	newtp		      = tcp_sk(newsk);
1717 	newinet		      = inet_sk(newsk);
1718 	ireq		      = inet_rsk(req);
1719 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1720 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1721 	newinet->mc_index     = inet_iif(skb);
1722 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1723 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1724 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1725 	if (inet_opt)
1726 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1727 	atomic_set(&newinet->inet_id, get_random_u16());
1728 
1729 	/* Set ToS of the new socket based upon the value of incoming SYN.
1730 	 * ECT bits are set later in tcp_init_transfer().
1731 	 */
1732 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1733 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1734 
1735 	if (!dst) {
1736 		dst = inet_csk_route_child_sock(sk, newsk, req);
1737 		if (!dst)
1738 			goto put_and_exit;
1739 	} else {
1740 		/* syncookie case : see end of cookie_v4_check() */
1741 	}
1742 	sk_setup_caps(newsk, dst);
1743 
1744 #if IS_ENABLED(CONFIG_IPV6)
1745 	if (opt_child_init)
1746 		opt_child_init(newsk, sk);
1747 #endif
1748 	tcp_ca_openreq_child(newsk, dst);
1749 
1750 	tcp_sync_mss(newsk, dst4_mtu(dst));
1751 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1752 
1753 	tcp_initialize_rcv_mss(newsk);
1754 
1755 #ifdef CONFIG_TCP_MD5SIG
1756 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1757 	/* Copy over the MD5 key from the original socket */
1758 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1759 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1760 	if (key && !tcp_rsk_used_ao(req)) {
1761 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1762 			goto put_and_exit;
1763 		sk_gso_disable(newsk);
1764 	}
1765 #endif
1766 #ifdef CONFIG_TCP_AO
1767 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1768 		goto put_and_exit; /* OOM, release back memory */
1769 #endif
1770 
1771 	if (__inet_inherit_port(sk, newsk) < 0)
1772 		goto put_and_exit;
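	/* Hash the child into the established table (and unhash the request
	 * sock).  *own_req tells the caller whether this child actually made
	 * it into the table; in the syncookie case a concurrently inserted
	 * duplicate (found_dup_sk) means the child is dropped below.
	 */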
1773 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1774 				       &found_dup_sk);
1775 	if (likely(*own_req)) {
1776 		tcp_move_syn(newtp, req);
1777 		ireq->ireq_opt = NULL;
1778 	} else {
1779 		newinet->inet_opt = NULL;
1780 
1781 		if (!req_unhash && found_dup_sk) {
1782 			/* This code path should only be executed in
1783 			 * the syncookie case.
1784 			 */
1785 			bh_unlock_sock(newsk);
1786 			sock_put(newsk);
1787 			newsk = NULL;
1788 		}
1789 	}
1790 	return newsk;
1791 
1792 exit_overflow:
1793 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1794 exit_nonewsk:
1795 	dst_release(dst);
1796 exit:
1797 	tcp_listendrop(sk);
1798 	return NULL;
1799 put_and_exit:
1800 	newinet->inet_opt = NULL;
1801 	inet_csk_prepare_forced_close(newsk);
1802 	tcp_done(newsk);
1803 	goto exit;
1804 }
1805 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1806 
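/* Under SYN flood with syncookies, the final ACK of the handshake arrives at
 * the listener with no request sock to match it.  For such non-SYN segments,
 * cookie_v4_check() validates the cookie embedded in the ACK and, if valid,
 * creates the child socket on the fly.
 */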
1807 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1808 {
1809 #ifdef CONFIG_SYN_COOKIES
1810 	const struct tcphdr *th = tcp_hdr(skb);
1811 
1812 	if (!th->syn)
1813 		sk = cookie_v4_check(sk, skb);
1814 #endif
1815 	return sk;
1816 }
1817 
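/* Build a SYN cookie for a SYN that is answered outside the regular stack
 * (e.g. from the bpf_tcp_gen_syncookie() helper).  Returns the clamped MSS
 * on success, or 0 if a cookie cannot be generated.
 */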
1818 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1819 			 struct tcphdr *th, u32 *cookie)
1820 {
1821 	u16 mss = 0;
1822 #ifdef CONFIG_SYN_COOKIES
1823 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1824 				    &tcp_request_sock_ipv4_ops, sk, th);
1825 	if (mss) {
1826 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1827 		tcp_synq_overflow(sk);
1828 	}
1829 #endif
1830 	return mss;
1831 }
1832 
1833 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1834 							   u32));
1835 /* The socket must have its spinlock held when we get
1836  * here, unless it is a TCP_LISTEN socket.
1837  *
1838  * We have a potential double-lock case here, so even when
1839  * doing backlog processing we use the BH locking scheme.
1840  * This is because we cannot sleep with the original spinlock
1841  * held.
1842  */
1843 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1844 {
1845 	enum skb_drop_reason reason;
1846 	struct sock *rsk;
1847 
1848 	reason = psp_sk_rx_policy_check(sk, skb);
1849 	if (reason)
1850 		goto err_discard;
1851 
1852 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1853 		struct dst_entry *dst;
1854 
1855 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1856 						lockdep_sock_is_held(sk));
1857 
1858 		sock_rps_save_rxhash(sk, skb);
1859 		sk_mark_napi_id(sk, skb);
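		/* Validate the cached input route: drop it if this packet
		 * arrived on a different interface or the dst has been
		 * invalidated, so a fresh route gets cached later.
		 */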
1860 		if (dst) {
1861 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1862 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1863 					     dst, 0)) {
1864 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1865 				dst_release(dst);
1866 			}
1867 		}
1868 		tcp_rcv_established(sk, skb);
1869 		return 0;
1870 	}
1871 
1872 	if (tcp_checksum_complete(skb))
1873 		goto csum_err;
1874 
1875 	if (sk->sk_state == TCP_LISTEN) {
1876 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1877 
1878 		if (!nsk)
1879 			return 0;
1880 		if (nsk != sk) {
1881 			reason = tcp_child_process(sk, nsk, skb);
1882 			if (reason) {
1883 				rsk = nsk;
1884 				goto reset;
1885 			}
1886 			return 0;
1887 		}
1888 	} else
1889 		sock_rps_save_rxhash(sk, skb);
1890 
1891 	reason = tcp_rcv_state_process(sk, skb);
1892 	if (reason) {
1893 		rsk = sk;
1894 		goto reset;
1895 	}
1896 	return 0;
1897 
1898 reset:
1899 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1900 discard:
1901 	sk_skb_reason_drop(sk, skb, reason);
1902 	/* Be careful here. If this function gets more complicated and
1903 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1904 	 * might be destroyed here. This current version compiles correctly,
1905 	 * but you have been warned.
1906 	 */
1907 	return 0;
1908 
1909 csum_err:
1910 	reason = SKB_DROP_REASON_TCP_CSUM;
1911 	trace_tcp_bad_csum(skb);
1912 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1913 err_discard:
1914 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1915 	goto discard;
1916 }
1917 EXPORT_SYMBOL(tcp_v4_do_rcv);
1918 
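/* Early demux: while still in the IP receive path, look up the established
 * socket this packet belongs to so its cached rx dst can be reused, avoiding
 * a full route lookup and a second socket lookup later on.
 */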
1919 int tcp_v4_early_demux(struct sk_buff *skb)
1920 {
1921 	struct net *net = dev_net_rcu(skb->dev);
1922 	const struct iphdr *iph;
1923 	const struct tcphdr *th;
1924 	struct sock *sk;
1925 
1926 	if (skb->pkt_type != PACKET_HOST)
1927 		return 0;
1928 
1929 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1930 		return 0;
1931 
1932 	iph = ip_hdr(skb);
1933 	th = tcp_hdr(skb);
1934 
1935 	if (th->doff < sizeof(struct tcphdr) / 4)
1936 		return 0;
1937 
1938 	sk = __inet_lookup_established(net, iph->saddr, th->source,
1939 				       iph->daddr, ntohs(th->dest),
1940 				       skb->skb_iif, inet_sdif(skb));
1941 	if (sk) {
1942 		skb->sk = sk;
1943 		skb->destructor = sock_edemux;
1944 		if (sk_fullsock(sk)) {
1945 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1946 
1947 			if (dst)
1948 				dst = dst_check(dst, 0);
1949 			if (dst &&
1950 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1951 				skb_dst_set_noref(skb, dst);
1952 		}
1953 	}
1954 	return 0;
1955 }
1956 
1957 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1958 		     enum skb_drop_reason *reason)
1959 {
1960 	u32 tail_gso_size, tail_gso_segs;
1961 	struct skb_shared_info *shinfo;
1962 	const struct tcphdr *th;
1963 	struct tcphdr *thtail;
1964 	struct sk_buff *tail;
1965 	unsigned int hdrlen;
1966 	bool fragstolen;
1967 	u32 gso_segs;
1968 	u32 gso_size;
1969 	u64 limit;
1970 	int delta;
1971 	int err;
1972 
1973 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1974 	 * we can fix skb->truesize to its real value to avoid future drops.
1975 	 * This is valid because skb is not yet charged to the socket.
1976 	 * It has been noticed that pure SACK packets were sometimes dropped
1977 	 * (if cooked by drivers without the copybreak feature).
1978 	 */
1979 	skb_condense(skb);
1980 
1981 	tcp_cleanup_skb(skb);
1982 
1983 	if (unlikely(tcp_checksum_complete(skb))) {
1984 		bh_unlock_sock(sk);
1985 		trace_tcp_bad_csum(skb);
1986 		*reason = SKB_DROP_REASON_TCP_CSUM;
1987 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1988 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1989 		return true;
1990 	}
1991 
1992 	/* Attempt coalescing to last skb in backlog, even if we are
1993 	 * above the limits.
1994 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1995 	 */
1996 	th = (const struct tcphdr *)skb->data;
1997 	hdrlen = th->doff * 4;
1998 
1999 	tail = sk->sk_backlog.tail;
2000 	if (!tail)
2001 		goto no_coalesce;
2002 	thtail = (struct tcphdr *)tail->data;
2003 
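	/* Coalescing is only safe if the new skb immediately follows the tail
	 * in sequence space, carries the same DSCP/ECN bits and TCP options,
	 * has no SYN/RST/URG, has ACK set like the tail does, and does not
	 * differ in the ECE/CWR/AE flag bits.
	 */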
2004 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2005 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2006 	    ((TCP_SKB_CB(tail)->tcp_flags |
2007 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2008 	    !((TCP_SKB_CB(tail)->tcp_flags &
2009 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2010 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2011 	      TCP_SKB_CB(skb)->tcp_flags) &
2012 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2013 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2014 	    thtail->doff != th->doff ||
2015 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
2016 	    /* prior to PSP Rx policy check, retain exact PSP metadata */
2017 	    psp_skb_coalesce_diff(tail, skb))
2018 		goto no_coalesce;
2019 
2020 	__skb_pull(skb, hdrlen);
2021 
2022 	shinfo = skb_shinfo(skb);
2023 	gso_size = shinfo->gso_size ?: skb->len;
2024 	gso_segs = shinfo->gso_segs ?: 1;
2025 
2026 	shinfo = skb_shinfo(tail);
2027 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2028 	tail_gso_segs = shinfo->gso_segs ?: 1;
2029 
2030 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2031 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2032 
2033 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2034 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2035 			thtail->window = th->window;
2036 		}
2037 
2038 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2039 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2040 		 * is not entered if we append a packet with a FIN.
2041 		 * SYN, RST, URG are not present.
2042 		 * ACK is set on both packets.
2043 		 * PSH : we do not really care in TCP stack,
2044 		 *       at least for 'GRO' packets.
2045 		 */
2046 		thtail->fin |= th->fin;
2047 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2048 
2049 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2050 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2051 			tail->tstamp = skb->tstamp;
2052 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2053 		}
2054 
2055 		/* Not as strict as GRO. We only need to carry mss max value */
2056 		shinfo->gso_size = max(gso_size, tail_gso_size);
2057 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2058 
2059 		sk->sk_backlog.len += delta;
2060 		__NET_INC_STATS(sock_net(sk),
2061 				LINUX_MIB_TCPBACKLOGCOALESCE);
2062 		kfree_skb_partial(skb, fragstolen);
2063 		return false;
2064 	}
2065 	__skb_push(skb, hdrlen);
2066 
2067 no_coalesce:
2068 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2069 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2070 	 * sk_rcvbuf in normal conditions.
2071 	 */
2072 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2073 
2074 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2075 
2076 	/* Only the socket owner can try to collapse/prune rx queues
2077 	 * to reduce memory overhead, so add a little headroom here.
2078 	 * Only a few socket backlogs are likely to be non-empty at once.
2079 	 */
2080 	limit += 64 * 1024;
2081 
2082 	limit = min_t(u64, limit, UINT_MAX);
2083 
2084 	err = sk_add_backlog(sk, skb, limit);
2085 	if (unlikely(err)) {
2086 		bh_unlock_sock(sk);
2087 		if (err == -ENOMEM) {
2088 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2089 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2090 		} else {
2091 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2092 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2093 		}
2094 		return true;
2095 	}
2096 	return false;
2097 }
2098 EXPORT_IPV6_MOD(tcp_add_backlog);
2099 
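/* TCP overlays its own control block (TCP_SKB_CB) on the skb->cb[] area that
 * IP was using.  tcp_v4_fill_cb() saves the IP control block inside
 * TCP_SKB_CB before filling in the TCP fields; tcp_v4_restore_cb() undoes
 * that when the skb has to be fed back into a lookup expecting IPCB intact.
 */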
2100 static void tcp_v4_restore_cb(struct sk_buff *skb)
2101 {
2102 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2103 		sizeof(struct inet_skb_parm));
2104 }
2105 
2106 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2107 			   const struct tcphdr *th)
2108 {
2109 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2110 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2111 	 */
2112 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2113 		sizeof(struct inet_skb_parm));
2114 	barrier();
2115 
2116 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2117 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2118 				    skb->len - th->doff * 4);
2119 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2120 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2121 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2122 	TCP_SKB_CB(skb)->sacked	 = 0;
2123 	TCP_SKB_CB(skb)->has_rxtstamp =
2124 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2125 }
2126 
2127 /*
2128  *	From tcp_input.c
2129  */
2130 
2131 int tcp_v4_rcv(struct sk_buff *skb)
2132 {
2133 	struct net *net = dev_net_rcu(skb->dev);
2134 	enum skb_drop_reason drop_reason;
2135 	enum tcp_tw_status tw_status;
2136 	int sdif = inet_sdif(skb);
2137 	int dif = inet_iif(skb);
2138 	const struct iphdr *iph;
2139 	const struct tcphdr *th;
2140 	struct sock *sk = NULL;
2141 	bool refcounted;
2142 	int ret;
2143 	u32 isn;
2144 
2145 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2146 	if (skb->pkt_type != PACKET_HOST)
2147 		goto discard_it;
2148 
2149 	/* Count it even if it's bad */
2150 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2151 
2152 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2153 		goto discard_it;
2154 
2155 	th = (const struct tcphdr *)skb->data;
2156 
2157 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2158 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2159 		goto bad_packet;
2160 	}
2161 	if (!pskb_may_pull(skb, th->doff * 4))
2162 		goto discard_it;
2163 
2164 	/* An explanation is required here, I think.
2165 	 * Packet length and doff are validated by header prediction,
2166 	 * provided the case of th->doff==0 is eliminated.
2167 	 * So, we defer the checks. */
2168 
2169 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2170 		goto csum_error;
2171 
2172 	th = (const struct tcphdr *)skb->data;
2173 	iph = ip_hdr(skb);
2174 lookup:
2175 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2176 			       th->dest, sdif, &refcounted);
2177 	if (!sk)
2178 		goto no_tcp_socket;
2179 
2180 	if (sk->sk_state == TCP_TIME_WAIT)
2181 		goto do_time_wait;
2182 
2183 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
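		/* The lookup matched a request sock (SYN_RECV mini socket),
		 * typically because this is the final ACK of the handshake.
		 * Validate it against the listener and let tcp_check_req()
		 * promote the request into a full child socket.
		 */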
2184 		struct request_sock *req = inet_reqsk(sk);
2185 		bool req_stolen = false;
2186 		struct sock *nsk;
2187 
2188 		sk = req->rsk_listener;
2189 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2190 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2191 		else
2192 			drop_reason = tcp_inbound_hash(sk, req, skb,
2193 						       &iph->saddr, &iph->daddr,
2194 						       AF_INET, dif, sdif);
2195 		if (unlikely(drop_reason)) {
2196 			sk_drops_skbadd(sk, skb);
2197 			reqsk_put(req);
2198 			goto discard_it;
2199 		}
2200 		if (tcp_checksum_complete(skb)) {
2201 			reqsk_put(req);
2202 			goto csum_error;
2203 		}
2204 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2205 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2206 			if (!nsk) {
2207 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2208 				goto lookup;
2209 			}
2210 			sk = nsk;
2211 			/* reuseport_migrate_sock() has already held one sk_refcnt
2212 			 * before returning.
2213 			 */
2214 		} else {
2215 			/* We own a reference on the listener, increase it again
2216 			 * as we might lose it too soon.
2217 			 */
2218 			sock_hold(sk);
2219 		}
2220 		refcounted = true;
2221 		nsk = NULL;
2222 		if (!tcp_filter(sk, skb, &drop_reason)) {
2223 			th = (const struct tcphdr *)skb->data;
2224 			iph = ip_hdr(skb);
2225 			tcp_v4_fill_cb(skb, iph, th);
2226 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2227 					    &drop_reason);
2228 		}
2229 		if (!nsk) {
2230 			reqsk_put(req);
2231 			if (req_stolen) {
2232 				/* Another cpu got exclusive access to req
2233 				 * and created a full blown socket.
2234 				 * Try to feed this packet to this socket
2235 				 * instead of discarding it.
2236 				 */
2237 				tcp_v4_restore_cb(skb);
2238 				sock_put(sk);
2239 				goto lookup;
2240 			}
2241 			goto discard_and_relse;
2242 		}
2243 		nf_reset_ct(skb);
2244 		if (nsk == sk) {
2245 			reqsk_put(req);
2246 			tcp_v4_restore_cb(skb);
2247 		} else {
2248 			drop_reason = tcp_child_process(sk, nsk, skb);
2249 			if (drop_reason) {
2250 				enum sk_rst_reason rst_reason;
2251 
2252 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2253 				tcp_v4_send_reset(nsk, skb, rst_reason);
2254 				goto discard_and_relse;
2255 			}
2256 			sock_put(sk);
2257 			return 0;
2258 		}
2259 	}
2260 
2261 process:
2262 	if (static_branch_unlikely(&ip4_min_ttl)) {
2263 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2264 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2265 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2266 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2267 			goto discard_and_relse;
2268 		}
2269 	}
2270 
2271 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2272 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2273 		goto discard_and_relse;
2274 	}
2275 
2276 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2277 				       AF_INET, dif, sdif);
2278 	if (drop_reason)
2279 		goto discard_and_relse;
2280 
2281 	nf_reset_ct(skb);
2282 
2283 	if (tcp_filter(sk, skb, &drop_reason))
2284 		goto discard_and_relse;
2285 
2286 	th = (const struct tcphdr *)skb->data;
2287 	iph = ip_hdr(skb);
2288 	tcp_v4_fill_cb(skb, iph, th);
2289 
2290 	skb->dev = NULL;
2291 
2292 	if (sk->sk_state == TCP_LISTEN) {
2293 		ret = tcp_v4_do_rcv(sk, skb);
2294 		goto put_and_return;
2295 	}
2296 
2297 	sk_incoming_cpu_update(sk);
2298 
2299 	bh_lock_sock_nested(sk);
2300 	tcp_segs_in(tcp_sk(sk), skb);
2301 	ret = 0;
2302 	if (!sock_owned_by_user(sk)) {
2303 		ret = tcp_v4_do_rcv(sk, skb);
2304 	} else {
2305 		if (tcp_add_backlog(sk, skb, &drop_reason))
2306 			goto discard_and_relse;
2307 	}
2308 	bh_unlock_sock(sk);
2309 
2310 put_and_return:
2311 	if (refcounted)
2312 		sock_put(sk);
2313 
2314 	return ret;
2315 
2316 no_tcp_socket:
2317 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2318 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2319 		goto discard_it;
2320 
2321 	tcp_v4_fill_cb(skb, iph, th);
2322 
2323 	if (tcp_checksum_complete(skb)) {
2324 csum_error:
2325 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2326 		trace_tcp_bad_csum(skb);
2327 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2328 bad_packet:
2329 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2330 	} else {
2331 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2332 	}
2333 
2334 discard_it:
2335 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2336 	/* Discard frame. */
2337 	sk_skb_reason_drop(sk, skb, drop_reason);
2338 	return 0;
2339 
2340 discard_and_relse:
2341 	sk_drops_skbadd(sk, skb);
2342 	if (refcounted)
2343 		sock_put(sk);
2344 	goto discard_it;
2345 
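/* The packet matched a TIME_WAIT socket.  tcp_timewait_state_process()
 * decides whether this is a new SYN that may safely reuse the 4-tuple
 * (TCP_TW_SYN), a segment that deserves an ACK, one that warrants a RST, or
 * one to silently drop (TCP_TW_SUCCESS).
 */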
2346 do_time_wait:
2347 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2348 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2349 		inet_twsk_put(inet_twsk(sk));
2350 		goto discard_it;
2351 	}
2352 
2353 	tcp_v4_fill_cb(skb, iph, th);
2354 
2355 	if (tcp_checksum_complete(skb)) {
2356 		inet_twsk_put(inet_twsk(sk));
2357 		goto csum_error;
2358 	}
2359 
2360 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2361 					       &drop_reason);
2362 	switch (tw_status) {
2363 	case TCP_TW_SYN: {
2364 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2365 							iph->saddr, th->source,
2366 							iph->daddr, th->dest,
2367 							inet_iif(skb),
2368 							sdif);
2369 		if (sk2) {
2370 			inet_twsk_deschedule_put(inet_twsk(sk));
2371 			sk = sk2;
2372 			tcp_v4_restore_cb(skb);
2373 			refcounted = false;
2374 			__this_cpu_write(tcp_tw_isn, isn);
2375 			goto process;
2376 		}
2377 
2378 		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
2379 		if (drop_reason)
2380 			break;
2381 	}
2382 		/* to ACK */
2383 		fallthrough;
2384 	case TCP_TW_ACK:
2385 	case TCP_TW_ACK_OOW:
2386 		tcp_v4_timewait_ack(sk, skb, tw_status);
2387 		break;
2388 	case TCP_TW_RST:
2389 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2390 		inet_twsk_deschedule_put(inet_twsk(sk));
2391 		goto discard_it;
2392 	case TCP_TW_SUCCESS:;
2393 	}
2394 	goto discard_it;
2395 }
2396 
2397 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2398 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2399 };
2400 
2401 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2402 {
2403 	struct dst_entry *dst = skb_dst(skb);
2404 
2405 	if (dst && dst_hold_safe(dst)) {
2406 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2407 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2408 	}
2409 }
2410 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2411 
2412 const struct inet_connection_sock_af_ops ipv4_specific = {
2413 	.queue_xmit	   = ip_queue_xmit,
2414 	.rebuild_header	   = inet_sk_rebuild_header,
2415 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2416 	.conn_request	   = tcp_v4_conn_request,
2417 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2418 	.net_header_len	   = sizeof(struct iphdr),
2419 	.setsockopt	   = ip_setsockopt,
2420 	.getsockopt	   = ip_getsockopt,
2421 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2422 };
2423 EXPORT_IPV6_MOD(ipv4_specific);
2424 
2425 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2426 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2427 #ifdef CONFIG_TCP_MD5SIG
2428 	.md5_lookup		= tcp_v4_md5_lookup,
2429 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2430 	.md5_parse		= tcp_v4_parse_md5_keys,
2431 #endif
2432 #ifdef CONFIG_TCP_AO
2433 	.ao_lookup		= tcp_v4_ao_lookup,
2434 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2435 	.ao_parse		= tcp_v4_parse_ao,
2436 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2437 #endif
2438 };
2439 
2440 static void tcp4_destruct_sock(struct sock *sk)
2441 {
2442 	tcp_md5_destruct_sock(sk);
2443 	tcp_ao_destroy_sock(sk, false);
2444 	inet_sock_destruct(sk);
2445 }
2446 #endif
2447 
2448 /* NOTE: A lot of things are set to zero explicitly by the call to
2449  *       sk_alloc(), so they need not be done here.
2450  */
2451 static int tcp_v4_init_sock(struct sock *sk)
2452 {
2453 	struct inet_connection_sock *icsk = inet_csk(sk);
2454 
2455 	tcp_init_sock(sk);
2456 
2457 	icsk->icsk_af_ops = &ipv4_specific;
2458 
2459 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2460 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2461 	sk->sk_destruct = tcp4_destruct_sock;
2462 #endif
2463 
2464 	return 0;
2465 }
2466 
2467 static void tcp_release_user_frags(struct sock *sk)
2468 {
2469 #ifdef CONFIG_PAGE_POOL
2470 	unsigned long index;
2471 	void *netmem;
2472 
2473 	xa_for_each(&sk->sk_user_frags, index, netmem)
2474 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2475 #endif
2476 }
2477 
2478 void tcp_v4_destroy_sock(struct sock *sk)
2479 {
2480 	struct tcp_sock *tp = tcp_sk(sk);
2481 
2482 	tcp_release_user_frags(sk);
2483 
2484 	xa_destroy(&sk->sk_user_frags);
2485 
2486 	trace_tcp_destroy_sock(sk);
2487 
2488 	tcp_clear_xmit_timers(sk);
2489 
2490 	tcp_cleanup_congestion_control(sk);
2491 
2492 	tcp_cleanup_ulp(sk);
2493 
2494 	/* Clean up the write buffer. */
2495 	tcp_write_queue_purge(sk);
2496 
2497 	/* Check if we want to disable active TFO */
2498 	tcp_fastopen_active_disable_ofo_check(sk);
2499 
2500 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2501 	skb_rbtree_purge(&tp->out_of_order_queue);
2502 
2503 	/* Clean up a referenced TCP bind bucket. */
2504 	if (inet_csk(sk)->icsk_bind_hash)
2505 		inet_put_port(sk);
2506 
2507 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2508 
2509 	/* If socket is aborted during connect operation */
2510 	tcp_free_fastopen_req(tp);
2511 	tcp_fastopen_destroy_cipher(sk);
2512 	tcp_saved_syn_free(tp);
2513 
2514 	sk_sockets_allocated_dec(sk);
2515 }
2516 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2517 
2518 #ifdef CONFIG_PROC_FS
2519 /* Proc filesystem TCP sock list dumping. */
2520 
2521 static unsigned short seq_file_family(const struct seq_file *seq);
2522 
2523 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2524 {
2525 	unsigned short family = seq_file_family(seq);
2526 
2527 	/* AF_UNSPEC is used as a match all */
2528 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2529 		net_eq(sock_net(sk), seq_file_net(seq)));
2530 }
2531 
2532 /* Find a non-empty bucket (starting from st->bucket)
2533  * and return the first sk from it.
2534  */
2535 static void *listening_get_first(struct seq_file *seq)
2536 {
2537 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2538 	struct tcp_iter_state *st = seq->private;
2539 
2540 	st->offset = 0;
2541 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2542 		struct inet_listen_hashbucket *ilb2;
2543 		struct hlist_nulls_node *node;
2544 		struct sock *sk;
2545 
2546 		ilb2 = &hinfo->lhash2[st->bucket];
2547 		if (hlist_nulls_empty(&ilb2->nulls_head))
2548 			continue;
2549 
2550 		spin_lock(&ilb2->lock);
2551 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2552 			if (seq_sk_match(seq, sk))
2553 				return sk;
2554 		}
2555 		spin_unlock(&ilb2->lock);
2556 	}
2557 
2558 	return NULL;
2559 }
2560 
2561 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2562  * If "cur" is the last one in the st->bucket,
2563  * call listening_get_first() to return the first sk of the next
2564  * non-empty bucket.
2565  */
2566 static void *listening_get_next(struct seq_file *seq, void *cur)
2567 {
2568 	struct tcp_iter_state *st = seq->private;
2569 	struct inet_listen_hashbucket *ilb2;
2570 	struct hlist_nulls_node *node;
2571 	struct inet_hashinfo *hinfo;
2572 	struct sock *sk = cur;
2573 
2574 	++st->num;
2575 	++st->offset;
2576 
2577 	sk = sk_nulls_next(sk);
2578 	sk_nulls_for_each_from(sk, node) {
2579 		if (seq_sk_match(seq, sk))
2580 			return sk;
2581 	}
2582 
2583 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2584 	ilb2 = &hinfo->lhash2[st->bucket];
2585 	spin_unlock(&ilb2->lock);
2586 	++st->bucket;
2587 	return listening_get_first(seq);
2588 }
2589 
2590 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2591 {
2592 	struct tcp_iter_state *st = seq->private;
2593 	void *rc;
2594 
2595 	st->bucket = 0;
2596 	st->offset = 0;
2597 	rc = listening_get_first(seq);
2598 
2599 	while (rc && *pos) {
2600 		rc = listening_get_next(seq, rc);
2601 		--*pos;
2602 	}
2603 	return rc;
2604 }
2605 
2606 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2607 				const struct tcp_iter_state *st)
2608 {
2609 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2610 }
2611 
2612 /*
2613  * Get first established socket starting from bucket given in st->bucket.
2614  * If st->bucket is zero, the very first socket in the hash is returned.
2615  */
2616 static void *established_get_first(struct seq_file *seq)
2617 {
2618 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2619 	struct tcp_iter_state *st = seq->private;
2620 
2621 	st->offset = 0;
2622 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2623 		struct sock *sk;
2624 		struct hlist_nulls_node *node;
2625 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2626 
2627 		cond_resched();
2628 
2629 		/* Lockless fast path for the common case of empty buckets */
2630 		if (empty_bucket(hinfo, st))
2631 			continue;
2632 
2633 		spin_lock_bh(lock);
2634 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2635 			if (seq_sk_match(seq, sk))
2636 				return sk;
2637 		}
2638 		spin_unlock_bh(lock);
2639 	}
2640 
2641 	return NULL;
2642 }
2643 
2644 static void *established_get_next(struct seq_file *seq, void *cur)
2645 {
2646 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2647 	struct tcp_iter_state *st = seq->private;
2648 	struct hlist_nulls_node *node;
2649 	struct sock *sk = cur;
2650 
2651 	++st->num;
2652 	++st->offset;
2653 
2654 	sk = sk_nulls_next(sk);
2655 
2656 	sk_nulls_for_each_from(sk, node) {
2657 		if (seq_sk_match(seq, sk))
2658 			return sk;
2659 	}
2660 
2661 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2662 	++st->bucket;
2663 	return established_get_first(seq);
2664 }
2665 
2666 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2667 {
2668 	struct tcp_iter_state *st = seq->private;
2669 	void *rc;
2670 
2671 	st->bucket = 0;
2672 	rc = established_get_first(seq);
2673 
2674 	while (rc && pos) {
2675 		rc = established_get_next(seq, rc);
2676 		--pos;
2677 	}
2678 	return rc;
2679 }
2680 
2681 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2682 {
2683 	void *rc;
2684 	struct tcp_iter_state *st = seq->private;
2685 
2686 	st->state = TCP_SEQ_STATE_LISTENING;
2687 	rc	  = listening_get_idx(seq, &pos);
2688 
2689 	if (!rc) {
2690 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2691 		rc	  = established_get_idx(seq, pos);
2692 	}
2693 
2694 	return rc;
2695 }
2696 
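/* Resume iteration from the bucket/offset recorded at the last stop, so a
 * large /proc/net/tcp read does not rescan every bucket from the start for
 * each chunk handed to userspace.
 */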
2697 static void *tcp_seek_last_pos(struct seq_file *seq)
2698 {
2699 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2700 	struct tcp_iter_state *st = seq->private;
2701 	int bucket = st->bucket;
2702 	int offset = st->offset;
2703 	int orig_num = st->num;
2704 	void *rc = NULL;
2705 
2706 	switch (st->state) {
2707 	case TCP_SEQ_STATE_LISTENING:
2708 		if (st->bucket > hinfo->lhash2_mask)
2709 			break;
2710 		rc = listening_get_first(seq);
2711 		while (offset-- && rc && bucket == st->bucket)
2712 			rc = listening_get_next(seq, rc);
2713 		if (rc)
2714 			break;
2715 		st->bucket = 0;
2716 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2717 		fallthrough;
2718 	case TCP_SEQ_STATE_ESTABLISHED:
2719 		if (st->bucket > hinfo->ehash_mask)
2720 			break;
2721 		rc = established_get_first(seq);
2722 		while (offset-- && rc && bucket == st->bucket)
2723 			rc = established_get_next(seq, rc);
2724 	}
2725 
2726 	st->num = orig_num;
2727 
2728 	return rc;
2729 }
2730 
2731 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2732 {
2733 	struct tcp_iter_state *st = seq->private;
2734 	void *rc;
2735 
2736 	if (*pos && *pos == st->last_pos) {
2737 		rc = tcp_seek_last_pos(seq);
2738 		if (rc)
2739 			goto out;
2740 	}
2741 
2742 	st->state = TCP_SEQ_STATE_LISTENING;
2743 	st->num = 0;
2744 	st->bucket = 0;
2745 	st->offset = 0;
2746 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2747 
2748 out:
2749 	st->last_pos = *pos;
2750 	return rc;
2751 }
2752 EXPORT_IPV6_MOD(tcp_seq_start);
2753 
2754 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2755 {
2756 	struct tcp_iter_state *st = seq->private;
2757 	void *rc = NULL;
2758 
2759 	if (v == SEQ_START_TOKEN) {
2760 		rc = tcp_get_idx(seq, 0);
2761 		goto out;
2762 	}
2763 
2764 	switch (st->state) {
2765 	case TCP_SEQ_STATE_LISTENING:
2766 		rc = listening_get_next(seq, v);
2767 		if (!rc) {
2768 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2769 			st->bucket = 0;
2770 			st->offset = 0;
2771 			rc	  = established_get_first(seq);
2772 		}
2773 		break;
2774 	case TCP_SEQ_STATE_ESTABLISHED:
2775 		rc = established_get_next(seq, v);
2776 		break;
2777 	}
2778 out:
2779 	++*pos;
2780 	st->last_pos = *pos;
2781 	return rc;
2782 }
2783 EXPORT_IPV6_MOD(tcp_seq_next);
2784 
2785 void tcp_seq_stop(struct seq_file *seq, void *v)
2786 {
2787 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2788 	struct tcp_iter_state *st = seq->private;
2789 
2790 	switch (st->state) {
2791 	case TCP_SEQ_STATE_LISTENING:
2792 		if (v != SEQ_START_TOKEN)
2793 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2794 		break;
2795 	case TCP_SEQ_STATE_ESTABLISHED:
2796 		if (v)
2797 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2798 		break;
2799 	}
2800 }
2801 EXPORT_IPV6_MOD(tcp_seq_stop);
2802 
2803 static void get_openreq4(const struct request_sock *req,
2804 			 struct seq_file *f, int i)
2805 {
2806 	const struct inet_request_sock *ireq = inet_rsk(req);
2807 	long delta = req->rsk_timer.expires - jiffies;
2808 
2809 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2810 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2811 		i,
2812 		ireq->ir_loc_addr,
2813 		ireq->ir_num,
2814 		ireq->ir_rmt_addr,
2815 		ntohs(ireq->ir_rmt_port),
2816 		TCP_SYN_RECV,
2817 		0, 0, /* could print option size, but that is af dependent. */
2818 		1,    /* timers active (only the expire timer) */
2819 		jiffies_delta_to_clock_t(delta),
2820 		req->num_timeout,
2821 		from_kuid_munged(seq_user_ns(f),
2822 				 sk_uid(req->rsk_listener)),
2823 		0,  /* non standard timer */
2824 		0, /* open_requests have no inode */
2825 		0,
2826 		req);
2827 }
2828 
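/* Emit one /proc/net/tcp line for a full socket.  The "tr" field encodes the
 * pending timer: 1 retransmit/loss probe, 2 keepalive, 4 zero window probe,
 * 0 none (3 is used for TIME_WAIT sockets, see get_timewait4_sock()).
 */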
2829 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2830 {
2831 	int timer_active;
2832 	unsigned long timer_expires;
2833 	const struct tcp_sock *tp = tcp_sk(sk);
2834 	const struct inet_connection_sock *icsk = inet_csk(sk);
2835 	const struct inet_sock *inet = inet_sk(sk);
2836 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2837 	__be32 dest = inet->inet_daddr;
2838 	__be32 src = inet->inet_rcv_saddr;
2839 	__u16 destp = ntohs(inet->inet_dport);
2840 	__u16 srcp = ntohs(inet->inet_sport);
2841 	u8 icsk_pending;
2842 	int rx_queue;
2843 	int state;
2844 
2845 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2846 	if (icsk_pending == ICSK_TIME_RETRANS ||
2847 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2848 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2849 		timer_active	= 1;
2850 		timer_expires	= tcp_timeout_expires(sk);
2851 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2852 		timer_active	= 4;
2853 		timer_expires	= tcp_timeout_expires(sk);
2854 	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
2855 		timer_active	= 2;
2856 		timer_expires	= icsk->icsk_keepalive_timer.expires;
2857 	} else {
2858 		timer_active	= 0;
2859 		timer_expires = jiffies;
2860 	}
2861 
2862 	state = inet_sk_state_load(sk);
2863 	if (state == TCP_LISTEN)
2864 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2865 	else
2866 		/* Because we don't lock the socket,
2867 		 * we might find a transient negative value.
2868 		 */
2869 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2870 				      READ_ONCE(tp->copied_seq), 0);
2871 
2872 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2873 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2874 		i, src, srcp, dest, destp, state,
2875 		READ_ONCE(tp->write_seq) - tp->snd_una,
2876 		rx_queue,
2877 		timer_active,
2878 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2879 		READ_ONCE(icsk->icsk_retransmits),
2880 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2881 		READ_ONCE(icsk->icsk_probes_out),
2882 		sock_i_ino(sk),
2883 		refcount_read(&sk->sk_refcnt), sk,
2884 		jiffies_to_clock_t(icsk->icsk_rto),
2885 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2886 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2887 		tcp_snd_cwnd(tp),
2888 		state == TCP_LISTEN ?
2889 		    fastopenq->max_qlen :
2890 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2891 }
2892 
2893 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2894 			       struct seq_file *f, int i)
2895 {
2896 	long delta = tw->tw_timer.expires - jiffies;
2897 	__be32 dest, src;
2898 	__u16 destp, srcp;
2899 
2900 	dest  = tw->tw_daddr;
2901 	src   = tw->tw_rcv_saddr;
2902 	destp = ntohs(tw->tw_dport);
2903 	srcp  = ntohs(tw->tw_sport);
2904 
2905 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2906 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2907 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2908 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2909 		refcount_read(&tw->tw_refcnt), tw);
2910 }
2911 
2912 #define TMPSZ 150
2913 
2914 static int tcp4_seq_show(struct seq_file *seq, void *v)
2915 {
2916 	struct tcp_iter_state *st;
2917 	struct sock *sk = v;
2918 
2919 	seq_setwidth(seq, TMPSZ - 1);
2920 	if (v == SEQ_START_TOKEN) {
2921 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2922 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2923 			   "inode");
2924 		goto out;
2925 	}
2926 	st = seq->private;
2927 
2928 	if (sk->sk_state == TCP_TIME_WAIT)
2929 		get_timewait4_sock(v, seq, st->num);
2930 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2931 		get_openreq4(v, seq, st->num);
2932 	else
2933 		get_tcp4_sock(v, seq, st->num);
2934 out:
2935 	seq_pad(seq, '\n');
2936 	return 0;
2937 }
2938 
2939 #ifdef CONFIG_BPF_SYSCALL
2940 union bpf_tcp_iter_batch_item {
2941 	struct sock *sk;
2942 	__u64 cookie;
2943 };
2944 
2945 struct bpf_tcp_iter_state {
2946 	struct tcp_iter_state state;
2947 	unsigned int cur_sk;
2948 	unsigned int end_sk;
2949 	unsigned int max_sk;
2950 	union bpf_tcp_iter_batch_item *batch;
2951 };
2952 
2953 struct bpf_iter__tcp {
2954 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2955 	__bpf_md_ptr(struct sock_common *, sk_common);
2956 	uid_t uid __aligned(8);
2957 };
2958 
2959 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2960 			     struct sock_common *sk_common, uid_t uid)
2961 {
2962 	struct bpf_iter__tcp ctx;
2963 
2964 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2965 	ctx.meta = meta;
2966 	ctx.sk_common = sk_common;
2967 	ctx.uid = uid;
2968 	return bpf_iter_run_prog(prog, &ctx);
2969 }
2970 
2971 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2972 {
2973 	union bpf_tcp_iter_batch_item *item;
2974 	unsigned int cur_sk = iter->cur_sk;
2975 	__u64 cookie;
2976 
2977 	/* Remember the cookies of the sockets we haven't seen yet, so we can
2978 	 * pick up where we left off next time around.
2979 	 */
2980 	while (cur_sk < iter->end_sk) {
2981 		item = &iter->batch[cur_sk++];
2982 		cookie = sock_gen_cookie(item->sk);
2983 		sock_gen_put(item->sk);
2984 		item->cookie = cookie;
2985 	}
2986 }
2987 
2988 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2989 				      unsigned int new_batch_sz, gfp_t flags)
2990 {
2991 	union bpf_tcp_iter_batch_item *new_batch;
2992 
2993 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2994 			     flags | __GFP_NOWARN);
2995 	if (!new_batch)
2996 		return -ENOMEM;
2997 
2998 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
2999 	kvfree(iter->batch);
3000 	iter->batch = new_batch;
3001 	iter->max_sk = new_batch_sz;
3002 
3003 	return 0;
3004 }
3005 
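/* When the iterator was stopped mid-bucket, the sockets not yet shown are
 * remembered only by their cookies.  Walk the chain from first_sk and return
 * the first socket whose cookie matches one of them, so iteration resumes
 * where it left off even if the bucket has changed in the meantime.
 */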
3006 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3007 					       union bpf_tcp_iter_batch_item *cookies,
3008 					       int n_cookies)
3009 {
3010 	struct hlist_nulls_node *node;
3011 	struct sock *sk;
3012 	int i;
3013 
3014 	for (i = 0; i < n_cookies; i++) {
3015 		sk = first_sk;
3016 		sk_nulls_for_each_from(sk, node)
3017 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3018 				return sk;
3019 	}
3020 
3021 	return NULL;
3022 }
3023 
3024 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3025 {
3026 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3027 	struct bpf_tcp_iter_state *iter = seq->private;
3028 	struct tcp_iter_state *st = &iter->state;
3029 	unsigned int find_cookie = iter->cur_sk;
3030 	unsigned int end_cookie = iter->end_sk;
3031 	int resume_bucket = st->bucket;
3032 	struct sock *sk;
3033 
3034 	if (end_cookie && find_cookie == end_cookie)
3035 		++st->bucket;
3036 
3037 	sk = listening_get_first(seq);
3038 	iter->cur_sk = 0;
3039 	iter->end_sk = 0;
3040 
3041 	if (sk && st->bucket == resume_bucket && end_cookie) {
3042 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3043 						end_cookie - find_cookie);
3044 		if (!sk) {
3045 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3046 			++st->bucket;
3047 			sk = listening_get_first(seq);
3048 		}
3049 	}
3050 
3051 	return sk;
3052 }
3053 
3054 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3055 {
3056 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3057 	struct bpf_tcp_iter_state *iter = seq->private;
3058 	struct tcp_iter_state *st = &iter->state;
3059 	unsigned int find_cookie = iter->cur_sk;
3060 	unsigned int end_cookie = iter->end_sk;
3061 	int resume_bucket = st->bucket;
3062 	struct sock *sk;
3063 
3064 	if (end_cookie && find_cookie == end_cookie)
3065 		++st->bucket;
3066 
3067 	sk = established_get_first(seq);
3068 	iter->cur_sk = 0;
3069 	iter->end_sk = 0;
3070 
3071 	if (sk && st->bucket == resume_bucket && end_cookie) {
3072 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3073 						end_cookie - find_cookie);
3074 		if (!sk) {
3075 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3076 			++st->bucket;
3077 			sk = established_get_first(seq);
3078 		}
3079 	}
3080 
3081 	return sk;
3082 }
3083 
3084 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3085 {
3086 	struct bpf_tcp_iter_state *iter = seq->private;
3087 	struct tcp_iter_state *st = &iter->state;
3088 	struct sock *sk = NULL;
3089 
3090 	switch (st->state) {
3091 	case TCP_SEQ_STATE_LISTENING:
3092 		sk = bpf_iter_tcp_resume_listening(seq);
3093 		if (sk)
3094 			break;
3095 		st->bucket = 0;
3096 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3097 		fallthrough;
3098 	case TCP_SEQ_STATE_ESTABLISHED:
3099 		sk = bpf_iter_tcp_resume_established(seq);
3100 		break;
3101 	}
3102 
3103 	return sk;
3104 }
3105 
3106 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3107 						 struct sock **start_sk)
3108 {
3109 	struct bpf_tcp_iter_state *iter = seq->private;
3110 	struct hlist_nulls_node *node;
3111 	unsigned int expected = 1;
3112 	struct sock *sk;
3113 
3114 	sock_hold(*start_sk);
3115 	iter->batch[iter->end_sk++].sk = *start_sk;
3116 
3117 	sk = sk_nulls_next(*start_sk);
3118 	*start_sk = NULL;
3119 	sk_nulls_for_each_from(sk, node) {
3120 		if (seq_sk_match(seq, sk)) {
3121 			if (iter->end_sk < iter->max_sk) {
3122 				sock_hold(sk);
3123 				iter->batch[iter->end_sk++].sk = sk;
3124 			} else if (!*start_sk) {
3125 				/* Remember where we left off. */
3126 				*start_sk = sk;
3127 			}
3128 			expected++;
3129 		}
3130 	}
3131 
3132 	return expected;
3133 }
3134 
3135 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3136 						   struct sock **start_sk)
3137 {
3138 	struct bpf_tcp_iter_state *iter = seq->private;
3139 	struct hlist_nulls_node *node;
3140 	unsigned int expected = 1;
3141 	struct sock *sk;
3142 
3143 	sock_hold(*start_sk);
3144 	iter->batch[iter->end_sk++].sk = *start_sk;
3145 
3146 	sk = sk_nulls_next(*start_sk);
3147 	*start_sk = NULL;
3148 	sk_nulls_for_each_from(sk, node) {
3149 		if (seq_sk_match(seq, sk)) {
3150 			if (iter->end_sk < iter->max_sk) {
3151 				sock_hold(sk);
3152 				iter->batch[iter->end_sk++].sk = sk;
3153 			} else if (!*start_sk) {
3154 				/* Remember where we left off. */
3155 				*start_sk = sk;
3156 			}
3157 			expected++;
3158 		}
3159 	}
3160 
3161 	return expected;
3162 }
3163 
3164 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3165 					struct sock **start_sk)
3166 {
3167 	struct bpf_tcp_iter_state *iter = seq->private;
3168 	struct tcp_iter_state *st = &iter->state;
3169 
3170 	if (st->state == TCP_SEQ_STATE_LISTENING)
3171 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3172 	else
3173 		return bpf_iter_tcp_established_batch(seq, start_sk);
3174 }
3175 
3176 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3177 {
3178 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3179 	struct bpf_tcp_iter_state *iter = seq->private;
3180 	struct tcp_iter_state *st = &iter->state;
3181 
3182 	if (st->state == TCP_SEQ_STATE_LISTENING)
3183 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3184 	else
3185 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3186 }
3187 
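/* Grab every matching socket of the current bucket into iter->batch.  If the
 * preallocated batch turns out to be too small, drop the bucket lock, grow
 * the batch and retry; if it is still too small, grow it once more while
 * keeping the lock held so the bucket cannot change underneath us.
 */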
3188 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3189 {
3190 	struct bpf_tcp_iter_state *iter = seq->private;
3191 	unsigned int expected;
3192 	struct sock *sk;
3193 	int err;
3194 
3195 	sk = bpf_iter_tcp_resume(seq);
3196 	if (!sk)
3197 		return NULL; /* Done */
3198 
3199 	expected = bpf_iter_fill_batch(seq, &sk);
3200 	if (likely(iter->end_sk == expected))
3201 		goto done;
3202 
3203 	/* Batch size was too small. */
3204 	bpf_iter_tcp_unlock_bucket(seq);
3205 	bpf_iter_tcp_put_batch(iter);
3206 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3207 					 GFP_USER);
3208 	if (err)
3209 		return ERR_PTR(err);
3210 
3211 	sk = bpf_iter_tcp_resume(seq);
3212 	if (!sk)
3213 		return NULL; /* Done */
3214 
3215 	expected = bpf_iter_fill_batch(seq, &sk);
3216 	if (likely(iter->end_sk == expected))
3217 		goto done;
3218 
3219 	/* Batch size was still too small. Hold onto the lock while we try
3220 	 * again with a larger batch to make sure the current bucket's size
3221 	 * does not change in the meantime.
3222 	 */
3223 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3224 	if (err) {
3225 		bpf_iter_tcp_unlock_bucket(seq);
3226 		return ERR_PTR(err);
3227 	}
3228 
3229 	expected = bpf_iter_fill_batch(seq, &sk);
3230 	WARN_ON_ONCE(iter->end_sk != expected);
3231 done:
3232 	bpf_iter_tcp_unlock_bucket(seq);
3233 	return iter->batch[0].sk;
3234 }
3235 
3236 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3237 {
3238 	/* bpf iter does not support lseek, so it always
3239 	 * continues from where it was stop()-ped.
3240 	 */
3241 	if (*pos)
3242 		return bpf_iter_tcp_batch(seq);
3243 
3244 	return SEQ_START_TOKEN;
3245 }
3246 
3247 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3248 {
3249 	struct bpf_tcp_iter_state *iter = seq->private;
3250 	struct tcp_iter_state *st = &iter->state;
3251 	struct sock *sk;
3252 
3253 	/* Whenever seq_next() is called, the socket at iter->cur_sk is
3254 	 * done with seq_show(), so advance to the next sk in
3255 	 * the batch.
3256 	 */
3257 	if (iter->cur_sk < iter->end_sk) {
3258 		/* Keeping st->num consistent in tcp_iter_state.
3259 		 * bpf_iter_tcp does not use st->num.
3260 		 * meta.seq_num is used instead.
3261 		 */
3262 		st->num++;
3263 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3264 	}
3265 
3266 	if (iter->cur_sk < iter->end_sk)
3267 		sk = iter->batch[iter->cur_sk].sk;
3268 	else
3269 		sk = bpf_iter_tcp_batch(seq);
3270 
3271 	++*pos;
3272 	/* Keeping st->last_pos consistent in tcp_iter_state.
3273 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3274 	 */
3275 	st->last_pos = *pos;
3276 	return sk;
3277 }
3278 
3279 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3280 {
3281 	struct bpf_iter_meta meta;
3282 	struct bpf_prog *prog;
3283 	struct sock *sk = v;
3284 	uid_t uid;
3285 	int ret;
3286 
3287 	if (v == SEQ_START_TOKEN)
3288 		return 0;
3289 
3290 	if (sk_fullsock(sk))
3291 		lock_sock(sk);
3292 
3293 	if (unlikely(sk_unhashed(sk))) {
3294 		ret = SEQ_SKIP;
3295 		goto unlock;
3296 	}
3297 
3298 	if (sk->sk_state == TCP_TIME_WAIT) {
3299 		uid = 0;
3300 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3301 		const struct request_sock *req = v;
3302 
3303 		uid = from_kuid_munged(seq_user_ns(seq),
3304 				       sk_uid(req->rsk_listener));
3305 	} else {
3306 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3307 	}
3308 
3309 	meta.seq = seq;
3310 	prog = bpf_iter_get_info(&meta, false);
3311 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3312 
3313 unlock:
3314 	if (sk_fullsock(sk))
3315 		release_sock(sk);
3316 	return ret;
3317 
3318 }
3319 
3320 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3321 {
3322 	struct bpf_tcp_iter_state *iter = seq->private;
3323 	struct bpf_iter_meta meta;
3324 	struct bpf_prog *prog;
3325 
3326 	if (!v) {
3327 		meta.seq = seq;
3328 		prog = bpf_iter_get_info(&meta, true);
3329 		if (prog)
3330 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3331 	}
3332 
3333 	if (iter->cur_sk < iter->end_sk)
3334 		bpf_iter_tcp_put_batch(iter);
3335 }
3336 
3337 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3338 	.show		= bpf_iter_tcp_seq_show,
3339 	.start		= bpf_iter_tcp_seq_start,
3340 	.next		= bpf_iter_tcp_seq_next,
3341 	.stop		= bpf_iter_tcp_seq_stop,
3342 };
3343 #endif
3344 static unsigned short seq_file_family(const struct seq_file *seq)
3345 {
3346 	const struct tcp_seq_afinfo *afinfo;
3347 
3348 #ifdef CONFIG_BPF_SYSCALL
3349 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3350 	if (seq->op == &bpf_iter_tcp_seq_ops)
3351 		return AF_UNSPEC;
3352 #endif
3353 
3354 	/* Iterated from proc fs */
3355 	afinfo = pde_data(file_inode(seq->file));
3356 	return afinfo->family;
3357 }
3358 
3359 static const struct seq_operations tcp4_seq_ops = {
3360 	.show		= tcp4_seq_show,
3361 	.start		= tcp_seq_start,
3362 	.next		= tcp_seq_next,
3363 	.stop		= tcp_seq_stop,
3364 };
3365 
3366 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3367 	.family		= AF_INET,
3368 };
3369 
3370 static int __net_init tcp4_proc_init_net(struct net *net)
3371 {
3372 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3373 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3374 		return -ENOMEM;
3375 	return 0;
3376 }
3377 
3378 static void __net_exit tcp4_proc_exit_net(struct net *net)
3379 {
3380 	remove_proc_entry("tcp", net->proc_net);
3381 }
3382 
3383 static struct pernet_operations tcp4_net_ops = {
3384 	.init = tcp4_proc_init_net,
3385 	.exit = tcp4_proc_exit_net,
3386 };
3387 
3388 int __init tcp4_proc_init(void)
3389 {
3390 	return register_pernet_subsys(&tcp4_net_ops);
3391 }
3392 
3393 void tcp4_proc_exit(void)
3394 {
3395 	unregister_pernet_subsys(&tcp4_net_ops);
3396 }
3397 #endif /* CONFIG_PROC_FS */
3398 
3399 struct proto tcp_prot = {
3400 	.name			= "TCP",
3401 	.owner			= THIS_MODULE,
3402 	.close			= tcp_close,
3403 	.pre_connect		= tcp_v4_pre_connect,
3404 	.connect		= tcp_v4_connect,
3405 	.disconnect		= tcp_disconnect,
3406 	.accept			= inet_csk_accept,
3407 	.ioctl			= tcp_ioctl,
3408 	.init			= tcp_v4_init_sock,
3409 	.destroy		= tcp_v4_destroy_sock,
3410 	.shutdown		= tcp_shutdown,
3411 	.setsockopt		= tcp_setsockopt,
3412 	.getsockopt		= tcp_getsockopt,
3413 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3414 	.keepalive		= tcp_set_keepalive,
3415 	.recvmsg		= tcp_recvmsg,
3416 	.sendmsg		= tcp_sendmsg,
3417 	.splice_eof		= tcp_splice_eof,
3418 	.backlog_rcv		= tcp_v4_do_rcv,
3419 	.release_cb		= tcp_release_cb,
3420 	.hash			= inet_hash,
3421 	.unhash			= inet_unhash,
3422 	.get_port		= inet_csk_get_port,
3423 	.put_port		= inet_put_port,
3424 #ifdef CONFIG_BPF_SYSCALL
3425 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3426 #endif
3427 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3428 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3429 	.stream_memory_free	= tcp_stream_memory_free,
3430 	.sockets_allocated	= &tcp_sockets_allocated,
3431 
3432 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3433 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3434 
3435 	.memory_pressure	= &tcp_memory_pressure,
3436 	.sysctl_mem		= sysctl_tcp_mem,
3437 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3438 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3439 	.max_header		= MAX_TCP_HEADER,
3440 	.obj_size		= sizeof(struct tcp_sock),
3441 	.freeptr_offset		= offsetof(struct tcp_sock,
3442 					   inet_conn.icsk_inet.sk.sk_freeptr),
3443 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3444 	.twsk_prot		= &tcp_timewait_sock_ops,
3445 	.rsk_prot		= &tcp_request_sock_ops,
3446 	.h.hashinfo		= NULL,
3447 	.no_autobind		= true,
3448 	.diag_destroy		= tcp_abort,
3449 };
3450 EXPORT_SYMBOL(tcp_prot);
3451 
3452 static void __net_exit tcp_sk_exit(struct net *net)
3453 {
3454 	if (net->ipv4.tcp_congestion_control)
3455 		bpf_module_put(net->ipv4.tcp_congestion_control,
3456 			       net->ipv4.tcp_congestion_control->owner);
3457 }
3458 
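/* Give a child netns its own ehash when the creating netns asked for it,
 * e.g. via "sysctl -w net.ipv4.tcp_child_ehash_entries=16384" before the
 * netns is created; otherwise (or on allocation failure) the new netns
 * shares the global tcp_hashinfo.
 */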
3459 static void __net_init tcp_set_hashinfo(struct net *net)
3460 {
3461 	struct inet_hashinfo *hinfo;
3462 	unsigned int ehash_entries;
3463 	struct net *old_net;
3464 
3465 	if (net_eq(net, &init_net))
3466 		goto fallback;
3467 
3468 	old_net = current->nsproxy->net_ns;
3469 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3470 	if (!ehash_entries)
3471 		goto fallback;
3472 
3473 	ehash_entries = roundup_pow_of_two(ehash_entries);
3474 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3475 	if (!hinfo) {
3476 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3477 			"for a netns, fallback to the global one\n",
3478 			ehash_entries);
3479 fallback:
3480 		hinfo = &tcp_hashinfo;
3481 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3482 	}
3483 
3484 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3485 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3486 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3487 }
3488 
3489 static int __net_init tcp_sk_init(struct net *net)
3490 {
3491 	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3492 	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3493 	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3494 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3495 
3496 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3497 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3498 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3499 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3500 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3501 
3502 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3503 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3504 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3505 
3506 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3507 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3508 	net->ipv4.sysctl_tcp_syncookies = 1;
3509 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3510 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3511 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3512 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3513 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3514 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3515 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3516 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3517 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3518 
3519 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3520 	tcp_set_hashinfo(net);
3521 
3522 	net->ipv4.sysctl_tcp_sack = 1;
3523 	net->ipv4.sysctl_tcp_window_scaling = 1;
3524 	net->ipv4.sysctl_tcp_timestamps = 1;
3525 	net->ipv4.sysctl_tcp_early_retrans = 3;
3526 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3527 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3528 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3529 	net->ipv4.sysctl_tcp_max_reordering = 300;
3530 	net->ipv4.sysctl_tcp_dsack = 1;
3531 	net->ipv4.sysctl_tcp_app_win = 31;
3532 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3533 	net->ipv4.sysctl_tcp_frto = 2;
3534 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3535 	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
3536 	/* This limits the percentage of the congestion window which we
3537 	 * will allow a single TSO frame to consume.  Building TSO frames
3538 	 * which are too large can cause TCP streams to be bursty.
3539 	 */
3540 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3541 	/* Default TSQ limit of 4 MB */
3542 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3543 
3544 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3545 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3546 
3547 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3548 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3549 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3550 	net->ipv4.sysctl_tcp_autocorking = 1;
3551 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3552 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3553 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3554 	if (net != &init_net) {
3555 		memcpy(net->ipv4.sysctl_tcp_rmem,
3556 		       init_net.ipv4.sysctl_tcp_rmem,
3557 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3558 		memcpy(net->ipv4.sysctl_tcp_wmem,
3559 		       init_net.ipv4.sysctl_tcp_wmem,
3560 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3561 	}
3562 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3563 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
3564 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3565 	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
3566 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3567 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3568 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3569 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3570 
3571 	/* Set default values for PLB */
3572 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3573 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3574 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3575 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3576 	/* Default congestion threshold for PLB to mark a round is 50% */
3577 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3578 
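	/* Note (editorial): a new netns tries to inherit init_net's default
	 * congestion control, taking a module reference so it cannot be
	 * unloaded while in use; if that fails, it falls back to Reno.
	 */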
3579 	/* Reno is always built in */
3580 	if (!net_eq(net, &init_net) &&
3581 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3582 			       init_net.ipv4.tcp_congestion_control->owner))
3583 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3584 	else
3585 		net->ipv4.tcp_congestion_control = &tcp_reno;
3586 
3587 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3588 	net->ipv4.sysctl_tcp_shrink_window = 0;
3589 
3590 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
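	/* Note (editorial): the RTO minimum is stored in microseconds and the
	 * maximum in milliseconds; the defaults correspond to the classical
	 * TCP_RTO_MIN (200 ms) and TCP_RTO_MAX (120 s) bounds.
	 */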
3591 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3592 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3593 
3594 	return 0;
3595 }
3596 
3597 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3598 {
3599 	struct net *net;
3600 
3601 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3602 	 * and from the failed setup_net() error-unwinding path are serialized.
3603 	 *
3604 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those on
3605 	 * net_exit_list; the thread that dismantles a particular twsk must do
3606 	 * so without another thread progressing to refcount_dec_and_test() of
3607 	 * tcp_death_row.tw_refcount.
3608 	 */
3609 	mutex_lock(&tcp_exit_batch_mutex);
3610 
3611 	tcp_twsk_purge(net_exit_list);
3612 
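	/* Note (editorial): once all timewait sockets are purged, each netns's
	 * tw_refcount must drop back to the single reference taken in
	 * tcp_sk_init(); anything else indicates a leaked timewait socket.
	 */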
3613 	list_for_each_entry(net, net_exit_list, exit_list) {
3614 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3615 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3616 		tcp_fastopen_ctx_destroy(net);
3617 	}
3618 
3619 	mutex_unlock(&tcp_exit_batch_mutex);
3620 }
3621 
3622 static struct pernet_operations __net_initdata tcp_sk_ops = {
3623 	.init		= tcp_sk_init,
3624 	.exit		= tcp_sk_exit,
3625 	.exit_batch	= tcp_sk_exit_batch,
3626 };
3627 
3628 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3629 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3630 		     struct sock_common *sk_common, uid_t uid)
3631 
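/* Note (editorial): initial capacity of the socket batch used by the BPF TCP
 * iterator; the batch is grown on demand by bpf_iter_tcp_realloc_batch().
 */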
3632 #define INIT_BATCH_SZ 16
3633 
3634 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3635 {
3636 	struct bpf_tcp_iter_state *iter = priv_data;
3637 	int err;
3638 
3639 	err = bpf_iter_init_seq_net(priv_data, aux);
3640 	if (err)
3641 		return err;
3642 
3643 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3644 	if (err) {
3645 		bpf_iter_fini_seq_net(priv_data);
3646 		return err;
3647 	}
3648 
3649 	return 0;
3650 }
3651 
3652 static void bpf_iter_fini_tcp(void *priv_data)
3653 {
3654 	struct bpf_tcp_iter_state *iter = priv_data;
3655 
3656 	bpf_iter_fini_seq_net(priv_data);
3657 	kvfree(iter->batch);
3658 }
3659 
3660 static const struct bpf_iter_seq_info tcp_seq_info = {
3661 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3662 	.init_seq_private	= bpf_iter_init_tcp,
3663 	.fini_seq_private	= bpf_iter_fini_tcp,
3664 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3665 };
3666 
3667 static const struct bpf_func_proto *
3668 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3669 			    const struct bpf_prog *prog)
3670 {
3671 	switch (func_id) {
3672 	case BPF_FUNC_setsockopt:
3673 		return &bpf_sk_setsockopt_proto;
3674 	case BPF_FUNC_getsockopt:
3675 		return &bpf_sk_getsockopt_proto;
3676 	default:
3677 		return NULL;
3678 	}
3679 }
3680 
3681 static struct bpf_iter_reg tcp_reg_info = {
3682 	.target			= "tcp",
3683 	.ctx_arg_info_size	= 1,
3684 	.ctx_arg_info		= {
3685 		{ offsetof(struct bpf_iter__tcp, sk_common),
3686 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3687 	},
3688 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3689 	.seq_info		= &tcp_seq_info,
3690 };
3691 
3692 static void __init bpf_iter_register(void)
3693 {
3694 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3695 	if (bpf_iter_reg_target(&tcp_reg_info))
3696 		pr_warn("Warning: could not register bpf iterator tcp\n");
3697 }
3698 
3699 #endif
3700 
3701 void __init tcp_v4_init(void)
3702 {
3703 	int cpu, res;
3704 
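	/* Note (editorial): one control socket is created per possible CPU;
	 * tcp_v4_send_reset() and tcp_v4_send_ack() use these to transmit
	 * RSTs and ACKs that are not associated with a full socket.
	 */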
3705 	for_each_possible_cpu(cpu) {
3706 		struct sock *sk;
3707 
3708 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3709 					   IPPROTO_TCP, &init_net);
3710 		if (res)
3711 			panic("Failed to create the TCP control socket.\n");
3712 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3713 
3714 		/* Please enforce IP_DF and IPID==0 for RST and
3715 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3716 		 */
3717 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3718 
3719 		sk->sk_clockid = CLOCK_MONOTONIC;
3720 
3721 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3722 	}
3723 	if (register_pernet_subsys(&tcp_sk_ops))
3724 		panic("Failed to register the TCP pernet operations.\n");
3725 
3726 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3727 	bpf_iter_register();
3728 #endif
3729 }
3730