xref: /linux/net/ipv4/tcp_ipv4.c (revision c17ee635fd3a482b2ad2bf5e269755c2eae5f25e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 
92 #include <trace/events/tcp.h>
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
/* Global hash tables (established / bind / listen) for IPv4 TCP sockets. */
struct inet_hashinfo tcp_hashinfo;

/* Per-cpu kernel control socket used to transmit RSTs and ACKs that have
 * no full socket context; bh_lock serializes its use (matters on
 * PREEMPT_RT, where BH-disabled sections are preemptible).
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* NOTE(review): presumably serializes per-netns TCP exit batch teardown;
 * its users are outside this chunk — confirm against tcp_sk_exit_batch().
 */
static DEFINE_MUTEX(tcp_exit_batch_mutex);
106 
107 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108 {
109 	return secure_tcp_seq(ip_hdr(skb)->daddr,
110 			      ip_hdr(skb)->saddr,
111 			      tcp_hdr(skb)->dest,
112 			      tcp_hdr(skb)->source);
113 }
114 
115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116 {
117 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
118 }
119 
/* Decide whether the TIME-WAIT socket @sktw may be reused for a new
 * outgoing connection on the same 4-tuple by @sk.  @twp is NULL on the
 * repair-style path where reuse is forced whenever a timestamp is cached.
 *
 * Returns 1 when reuse is allowed — in that case the caller now holds a
 * reference on @sktw — and 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	/* A tw bucket still in FIN-WAIT2 substate never allows reuse. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		/* tw_reuse == 2 restricts reuse to loopback connections. */
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	/* Reuse is additionally gated on the configured delay having elapsed
	 * since the socket entered TIME-WAIT (tw_entry_stamp, in ms).
	 */
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* Start beyond the previous incarnation's send space
			 * so the sequence ranges cannot overlap.
			 */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
207 
/* connect() prologue: validate the user-supplied address length, then run
 * any attached cgroup BPF INET4_CONNECT program (which may rewrite the
 * destination) before tcp_v4_connect() proper.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
222 
/* This will initiate an outgoing connection.
 *
 * Resolves the route, selects source address/port, moves the socket to
 * SYN-SENT, enters it into the hash tables and sends the initial SYN
 * (unless deferred by TCP Fast Open).  Returns 0 or a negative errno;
 * on failure the socket is put back in TCP_CLOSE and its port released.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Strict source routing: route towards the first hop,
		 * not the final destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP cannot connect to multicast or broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* Not bound to a source address yet: adopt the one the
		 * route selected and update the bind hash accordingly.
		 */
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* The ephemeral source port is known now: re-validate the route
	 * with the final port pair.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* TCP Fast Open may defer sending the SYN until sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);
367 
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	/* Listening and closed sockets carry no path MTU state. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	/* mtu_info was stored by tcp_v4_err() before (possibly) deferring. */
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
407 
408 static void do_redirect(struct sk_buff *skb, struct sock *sk)
409 {
410 	struct dst_entry *dst = __sk_dst_check(sk, 0);
411 
412 	if (dst)
413 		dst->ops->redirect(dst, sk, skb);
414 }
415 
416 
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets.
 * @seq is the sequence number echoed in the ICMP payload; @abort requests
 * dropping the request socket.  Consumes the caller's reference on @sk.
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		/* Echoed sequence does not match our SYN-ACK ISN: stale or
		 * forged ICMP, count and ignore.
		 */
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	/* Release the reference taken by the caller's lookup. */
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);
441 
/* TCP-LD (RFC 6069) logic: on an ICMP unreachable hinting at a transient
 * route failure, undo one RTO backoff step and re-arm (or immediately
 * fire) the retransmit timer, since the loss was likely not congestion.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	/* Only act when the ICMP refers to the oldest unacked byte and we
	 * are actually retransmitting with backoff in effect.
	 */
	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one exponential-backoff step and recompute the RTO. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	/* Time already elapsed since the head skb was (re)transmitted. */
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
480 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the embedded IP header of the offending
	 * packet returned inside the ICMP message.
	 */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request sockets: abort only on errors that indicate the
		 * peer is unreachable; tcp_req_err() consumes our reference.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Ignore ICMPs whose echoed sequence falls outside our window. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Socket owned by user: defer to
				 * tcp_release_cb(); the bit's sock_hold() is
				 * dropped when the deferred work runs.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
663 
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 	struct tcphdr *th = tcp_hdr(skb);
667 
668 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 	skb->csum_start = skb_transport_header(skb) - skb->head;
670 	skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672 
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 	const struct inet_sock *inet = inet_sk(sk);
677 
678 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_IPV6_MOD(tcp_v4_send_check);
681 
682 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683 
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 				 const struct tcp_ao_hdr *aoh,
686 				 struct ip_reply_arg *arg, struct tcphdr *reply,
687 				 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 	int sdif = tcp_v4_sdif(skb);
691 	int dif = inet_iif(skb);
692 	int l3index = sdif ? dif : 0;
693 	bool allocated_traffic_key;
694 	struct tcp_ao_key *key;
695 	char *traffic_key;
696 	bool drop = true;
697 	u32 ao_sne = 0;
698 	u8 keyid;
699 
700 	rcu_read_lock();
701 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 				 &key, &traffic_key, &allocated_traffic_key,
703 				 &keyid, &ao_sne))
704 		goto out;
705 
706 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 				 (aoh->rnext_keyid << 8) | keyid);
708 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 	reply->doff = arg->iov[0].iov_len / 4;
710 
711 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 			    key, traffic_key,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 			    reply, ao_sne))
716 		goto out;
717 	drop = false;
718 out:
719 	rcu_read_unlock();
720 	if (allocated_traffic_key)
721 		kfree(traffic_key);
722 	return drop;
723 #else
724 	return true;
725 #endif
726 }
727 
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: a bare TCP header plus room for signing options. */
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK in the offending segment: RST must carry an
		 * acceptable ack covering SYN/FIN/data.
		 */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* Segment carried TCP-AO: the RST must be AO-signed or dropped. */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		/* Re-verify the incoming segment's MD5 before replying. */
		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* Allows reading tw_bound_dev_if through sk_bound_dev_if even when
	 * sk is really a timewait socket.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* Transmit via this cpu's kernel control socket. */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
928 
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and transmit a bare ACK (optionally carrying timestamp and
 * MD5/AO signing options) via the per-cpu kernel control socket.
 * @sk may be a timewait or request socket, never NULL here.
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Timestamp option, NOP-padded to a 4-byte boundary. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* MD5 option goes after the timestamp option, if present. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* Transmit via this cpu's kernel control socket. */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
1035 
/* ACK a segment received by a TIME-WAIT socket, echoing the timewait
 * state's sequence and timestamp bookkeeping, signed with TCP-AO or
 * TCP-MD5 when configured.  Consumes the tw reference.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: do not reply at all */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* NOTE: this open brace pairs with "if (0) {" in the
	 * !CONFIG_TCP_AO branch below, so that the following "} else if"
	 * parses in both configurations.
	 */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1099 
/* Send an ACK for a request socket (SYN-RECV, including Fast Open),
 * signed with TCP-AO or TCP-MD5 when a matching key exists for the peer.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Scratch buffer for the traffic key; freed after send */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* ECN bits of the echoed TOS are cleared so this ACK cannot be
	 * classified into a different (L4S vs Classic) service queue.
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
1177 
1178 /*
1179  *	Send a SYN-ACK after having received a SYN.
1180  *	This still operates on a request_sock only, not on a big
1181  *	socket.
1182  */
1183 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1184 			      struct flowi *fl,
1185 			      struct request_sock *req,
1186 			      struct tcp_fastopen_cookie *foc,
1187 			      enum tcp_synack_type synack_type,
1188 			      struct sk_buff *syn_skb)
1189 {
1190 	struct inet_request_sock *ireq = inet_rsk(req);
1191 	struct flowi4 fl4;
1192 	int err = -1;
1193 	struct sk_buff *skb;
1194 	u8 tos;
1195 
1196 	/* First, grab a route. */
1197 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1198 		return -1;
1199 
1200 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1201 
1202 	if (skb) {
1203 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1204 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1205 
1206 		tos = READ_ONCE(inet_sk(sk)->tos);
1207 
1208 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1209 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1210 			      (tos & INET_ECN_MASK);
1211 
1212 		if (!INET_ECN_is_capable(tos) &&
1213 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1214 			tos |= INET_ECN_ECT_0;
1215 
1216 		rcu_read_lock();
1217 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1218 					    ireq->ir_rmt_addr,
1219 					    rcu_dereference(ireq->ireq_opt),
1220 					    tos);
1221 		rcu_read_unlock();
1222 		err = net_xmit_eval(err);
1223 	}
1224 
1225 	return err;
1226 }
1227 
1228 /*
1229  *	IPv4 request_sock destructor.
1230  */
1231 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1232 {
1233 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1234 }
1235 
1236 #ifdef CONFIG_TCP_MD5SIG
1237 /*
1238  * RFC2385 MD5 checksumming requires a mapping of
1239  * IP address->MD5 Key.
1240  * We need to maintain these in the sk structure.
1241  */
1242 
/* Static key gating the TCP-MD5 code paths; "deferred" so that rapid
 * add/remove of the last key does not thrash the branch patching.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);
1245 
1246 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1247 {
1248 	if (!old)
1249 		return true;
1250 
1251 	/* l3index always overrides non-l3index */
1252 	if (old->l3index && new->l3index == 0)
1253 		return false;
1254 	if (old->l3index == 0 && new->l3index)
1255 		return true;
1256 
1257 	return old->prefixlen < new->prefixlen;
1258 }
1259 
/* Find the best-matching MD5 key for @addr on socket @sk.
 *
 * All configured keys are scanned; among those whose prefix covers
 * @addr (and whose L3 index matches, unless @any_l3index), the best
 * candidate per better_md5_match() is returned, or NULL if none match.
 * The caller must hold rcu_read_lock() or the socket lock.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Keys with the IFINDEX flag only match their own l3index */
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1304 
/* Look up a key by exact (addr, family, prefixlen, l3index, IFINDEX
 * flag) tuple; unlike __tcp_md5_do_lookup() there is no prefix or
 * priority matching.  Returns NULL when no identical entry exists.
 * The caller must hold rcu_read_lock() or the socket lock.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* only the IFINDEX bit of the flags takes part in matching */
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1338 
1339 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1340 					 const struct sock *addr_sk)
1341 {
1342 	const union tcp_md5_addr *addr;
1343 	int l3index;
1344 
1345 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1346 						 addr_sk->sk_bound_dev_if);
1347 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1348 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1349 }
1350 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1351 
1352 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1353 {
1354 	struct tcp_sock *tp = tcp_sk(sk);
1355 	struct tcp_md5sig_info *md5sig;
1356 
1357 	md5sig = kmalloc_obj(*md5sig, gfp);
1358 	if (!md5sig)
1359 		return -ENOMEM;
1360 
1361 	sk_gso_disable(sk);
1362 	INIT_HLIST_HEAD(&md5sig->head);
1363 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1364 	return 0;
1365 }
1366 
/* This can be called on a newly created socket, from other files.
 *
 * Insert or update the MD5 key identified by (@addr, @family,
 * @prefixlen, @l3index, @flags).  The socket's md5sig_info must already
 * exist.  Returns 0 on success, -ENOMEM on allocation failure.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
1416 
/* Add (or update) an MD5 key from process context, allocating the
 * per-socket md5sig_info and taking a tcp_md5_needed static-key
 * reference on first use.  Returns 0 on success, or -ENOMEM,
 * -EOPNOTSUPP (FIPS mode) / -EUSERS (static key refcount exhausted).
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		/* Enable the MD5 static branch; on failure undo the
		 * md5sig_info allocation just made above.
		 */
		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);
1446 
/* Copy an existing MD5 key @key to socket @sk under a new address
 * tuple.  Like tcp_md5_do_add() but safe in atomic context (uses
 * GFP_ATOMIC and a non-sleeping static-key increment).
 * Returns 0 on success, -ENOMEM or -EUSERS on failure.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		/* Take a static-key reference without sleeping; undo the
		 * md5sig_info allocation if that fails.
		 */
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);
1474 
/* Remove the MD5 key exactly matching the given tuple; returns -ENOENT
 * when absent.  The key memory is freed after an RCU grace period since
 * concurrent lookups may still reference it.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	/* uncharge the socket memory accounted by sock_kmalloc() */
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);
1489 
/* Free every MD5 key attached to @sk.  Uses plain kfree() (no RCU
 * grace period), so callers must guarantee no concurrent lookups
 * remain, as asserted by the rcu_dereference_protected(..., 1) below.
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		/* uncharge the socket memory accounted by sock_kmalloc() */
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}
1505 
/* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt: validate the
 * user-supplied tcp_md5sig structure and add or (for a zero key length)
 * delete the corresponding key.  Returns 0 or a negative errno.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* Only the IFINDEX flag is stored on the key */
	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	/* Optional address prefix (TCP_MD5SIG_EXT only) */
	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* A zero key length means "delete this key" */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
1571 
1572 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1573 				    __be32 daddr, __be32 saddr,
1574 				    const struct tcphdr *th, int nbytes)
1575 {
1576 	struct {
1577 		struct tcp4_pseudohdr ip;
1578 		struct tcphdr tcp;
1579 	} h;
1580 
1581 	h.ip.saddr = saddr;
1582 	h.ip.daddr = daddr;
1583 	h.ip.pad = 0;
1584 	h.ip.protocol = IPPROTO_TCP;
1585 	h.ip.len = cpu_to_be16(nbytes);
1586 	h.tcp = *th;
1587 	h.tcp.check = 0;
1588 	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1589 }
1590 
/* Compute the MD5 signature over the pseudo-header, the TCP header @th
 * (options included, per th->doff) and the key - no payload.  Used for
 * replies built on the stack, e.g. by tcp_v4_send_ack().
 */
static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
1602 
/* Compute the MD5 signature for a full segment: pseudo-header, TCP
 * header and payload.  Addresses come from @sk when one is supplied
 * (establish/request sockets), otherwise from the skb's IP header.
 */
noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1627 
1628 #endif
1629 
/* Initialize the IPv4 part of a new request socket from the incoming
 * SYN: local/remote addresses and a copy of the IP options.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
1641 
/* Initialize the request socket from the SYN, run the LSM connection
 * hook and compute a route for the reply.  Returns NULL when the
 * security check fails or no route is found.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1655 
/* Generic request_sock callbacks for IPv4 TCP connection requests */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};
1663 
/* AF_INET-specific request_sock helpers, passed to tcp_conn_request() */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1683 
1684 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1685 {
1686 	/* Never answer to SYNs send to broadcast or multicast */
1687 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1688 		goto drop;
1689 
1690 	return tcp_conn_request(&tcp_request_sock_ops,
1691 				&tcp_request_sock_ipv4_ops, sk, skb);
1692 
1693 drop:
1694 	tcp_listendrop(sk);
1695 	return 0;
1696 }
1697 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1698 
1699 
1700 /*
1701  * The three way handshake has completed - we got a valid synack -
1702  * now create the new socket.
1703  */
1704 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1705 				  struct request_sock *req,
1706 				  struct dst_entry *dst,
1707 				  struct request_sock *req_unhash,
1708 				  bool *own_req)
1709 {
1710 	struct inet_request_sock *ireq;
1711 	bool found_dup_sk = false;
1712 	struct inet_sock *newinet;
1713 	struct tcp_sock *newtp;
1714 	struct sock *newsk;
1715 #ifdef CONFIG_TCP_MD5SIG
1716 	const union tcp_md5_addr *addr;
1717 	struct tcp_md5sig_key *key;
1718 	int l3index;
1719 #endif
1720 	struct ip_options_rcu *inet_opt;
1721 
1722 	if (sk_acceptq_is_full(sk))
1723 		goto exit_overflow;
1724 
1725 	newsk = tcp_create_openreq_child(sk, req, skb);
1726 	if (!newsk)
1727 		goto exit_nonewsk;
1728 
1729 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1730 	inet_sk_rx_dst_set(newsk, skb);
1731 
1732 	newtp		      = tcp_sk(newsk);
1733 	newinet		      = inet_sk(newsk);
1734 	ireq		      = inet_rsk(req);
1735 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1736 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1737 	newinet->mc_index     = inet_iif(skb);
1738 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1739 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1740 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1741 	if (inet_opt)
1742 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1743 	atomic_set(&newinet->inet_id, get_random_u16());
1744 
1745 	/* Set ToS of the new socket based upon the value of incoming SYN.
1746 	 * ECT bits are set later in tcp_init_transfer().
1747 	 */
1748 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1749 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1750 
1751 	if (!dst) {
1752 		dst = inet_csk_route_child_sock(sk, newsk, req);
1753 		if (!dst)
1754 			goto put_and_exit;
1755 	} else {
1756 		/* syncookie case : see end of cookie_v4_check() */
1757 	}
1758 	sk_setup_caps(newsk, dst);
1759 
1760 	tcp_ca_openreq_child(newsk, dst);
1761 
1762 	tcp_sync_mss(newsk, dst4_mtu(dst));
1763 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1764 
1765 	tcp_initialize_rcv_mss(newsk);
1766 
1767 #ifdef CONFIG_TCP_MD5SIG
1768 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1769 	/* Copy over the MD5 key from the original socket */
1770 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1771 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1772 	if (key && !tcp_rsk_used_ao(req)) {
1773 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1774 			goto put_and_exit;
1775 		sk_gso_disable(newsk);
1776 	}
1777 #endif
1778 #ifdef CONFIG_TCP_AO
1779 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1780 		goto put_and_exit; /* OOM, release back memory */
1781 #endif
1782 
1783 	if (__inet_inherit_port(sk, newsk) < 0)
1784 		goto put_and_exit;
1785 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1786 				       &found_dup_sk);
1787 	if (likely(*own_req)) {
1788 		tcp_move_syn(newtp, req);
1789 		ireq->ireq_opt = NULL;
1790 	} else {
1791 		newinet->inet_opt = NULL;
1792 
1793 		if (!req_unhash && found_dup_sk) {
1794 			/* This code path should only be executed in the
1795 			 * syncookie case only
1796 			 */
1797 			bh_unlock_sock(newsk);
1798 			sock_put(newsk);
1799 			newsk = NULL;
1800 		}
1801 	}
1802 	return newsk;
1803 
1804 exit_overflow:
1805 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1806 exit_nonewsk:
1807 	dst_release(dst);
1808 exit:
1809 	tcp_listendrop(sk);
1810 	return NULL;
1811 put_and_exit:
1812 	newinet->inet_opt = NULL;
1813 	inet_csk_prepare_forced_close(newsk);
1814 	tcp_done(newsk);
1815 	goto exit;
1816 }
1817 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1818 
/* With syncookies, a non-SYN segment reaching a listener may be the ACK
 * that completes a cookie handshake: validate it and, if valid, return
 * the newly created child socket.  SYN segments (and builds without
 * CONFIG_SYN_COOKIES) pass through unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	/* cookies are carried/validated on the ACK, hence !th->syn */
	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1829 
/* Compute a syncookie ISN (in *cookie) and the clamped MSS for the SYN
 * described by @iph/@th.  Returns the MSS, or 0 when no cookie can be
 * generated (or CONFIG_SYN_COOKIES is disabled).
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1844 
1845 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1846 							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Returns 0; errors are signalled to the peer via RST, and dropped
 * skbs are accounted with a drop reason.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx dst if it went stale or the
			 * packet arrived on a different interface.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		/* May return a syncookie-validated child socket */
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1930 
/* Early demux: before routing, try to match the packet to an
 * established socket so its cached rx dst can be reused, skipping a
 * route lookup.  Best effort - always returns 0.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* a data offset below the minimum header size is malformed */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			/* only reuse the dst if it is still valid and the
			 * packet came in on the expected interface
			 */
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1968 
/* Queue @skb on the backlog of a user-owned socket, coalescing it with the
 * current backlog tail when the segments are compatible.
 * Called with bh_lock_sock() held; on every drop path this function
 * releases the lock itself before returning.
 * Returns true if the skb was dropped (*reason is set, caller frees it),
 * false if it was queued or merged into the tail skb.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	/* Verify the checksum now, while we still own the skb cheaply. */
	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing requirements: contiguous in sequence space, same
	 * DSCP byte, no SYN/RST/URG on either segment, ACK set on both,
	 * matching ECN/AE-related flags, identical TCP options, and
	 * passing the collapse and PSP metadata checks.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	/* Strip the TCP header: only payload is appended to the tail. */
	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Take the newer ack_seq/window pair unless it went backwards. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		/* Propagate the most recent rx timestamp to the merged skb. */
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Coalesce failed: restore the header we pulled and queue normally. */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	/* Clamp: the computation is done in u64 to avoid overflow, but the
	 * backlog limit argument is narrower.
	 */
	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
2110 EXPORT_IPV6_MOD(tcp_add_backlog);
2111 
/* Undo tcp_v4_fill_cb(): move the saved IPCB back to the start of
 * skb->cb[].  memmove, not memcpy, since the two areas may overlap.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
2117 
/* Populate TCP_SKB_CB() with the parsed TCP header fields.  The IP control
 * block sharing skb->cb[] is relocated first so it survives (see
 * tcp_v4_restore_cb() for the inverse operation).
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* end_seq counts SYN and FIN as one sequence number each. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2138 
2139 /*
2140  *	From tcp_input.c
2141  */
2142 
/* Main IPv4 TCP receive entry point, invoked by the IP layer for every TCP
 * segment addressed to this host.  Validates header and checksum, looks up
 * the owning socket (established, request, listening or timewait) and
 * dispatches the segment.  Always returns 0; the skb is consumed on every
 * path (delivered, backlogged, or freed via sk_skb_reason_drop()).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	/* Pull the full header including options into the linear area. */
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Re-read header pointers: pskb_may_pull() may have moved skb->data. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	/* SYN_RECV: the lookup returned a request socket; process the segment
	 * against its listener (or a reuseport-migrated one).
	 */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_skbadd(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb, &drop_reason)) {
			/* tcp_filter() (BPF) may have reallocated the skb. */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
					    &drop_reason);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* Still a pure ACK-less request: fall through to
			 * normal processing on the listener.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				goto discard_and_relse;
			}
			sock_put(sk);
			return 0;
		}
	}

process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb, &drop_reason))
		goto discard_and_relse;

	/* tcp_filter() may have changed the skb layout; re-parse. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	/* Process directly when the socket is not owned by a user context,
	 * otherwise defer the segment to the backlog.
	 */
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* tcp_add_backlog() unlocks the socket itself on drop. */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment for a nonexistent connection: answer with RST. */
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_skbadd(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}

	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
					       &drop_reason);
	switch (tw_status) {
	case TCP_TW_SYN: {
		/* Acceptable new SYN: hand it to a current listener, reusing
		 * the ISN chosen by tcp_timewait_state_process().
		 */
		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}

		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
		if (drop_reason)
			break;
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
	case TCP_TW_ACK_OOW:
		tcp_v4_timewait_ack(sk, skb, tw_status);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2408 
/* Tells the generic timewait code how large a TCP timewait socket is. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
};
2412 
/* Cache the skb's route on the socket for early demux, remembering the
 * ingress ifindex so tcp_v4_early_demux() can validate a later reuse.
 * Skipped entirely if the dst refcount could not be taken safely.
 */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst_hold_safe(dst)) {
		rcu_assign_pointer(sk->sk_rx_dst, dst);
		sk->sk_rx_dst_ifindex = skb->skb_iif;
	}
}
2422 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2423 
/* Address-family specific operations used by TCP sockets running over
 * plain IPv4 (also referenced by the IPv6 code for v4-mapped sockets,
 * hence the export below).
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
2436 EXPORT_IPV6_MOD(ipv4_specific);
2437 
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* MD5 / TCP-AO signing operations for IPv4 sockets; each sub-table only
 * exists when its respective config option is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup		= tcp_v4_ao_lookup,
	.calc_ao_hash		= tcp_v4_ao_hash_skb,
	.ao_parse		= tcp_v4_parse_ao,
	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
#endif
};

/* sk_destruct hook: release MD5 and AO key material before the generic
 * inet destructor runs.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
#endif
2460 
2461 /* NOTE: A lot of things set to zero explicitly by call to
2462  *       sk_alloc() so need not be done here.
2463  */
/* Per-socket init for AF_INET TCP: run the generic TCP setup, then install
 * the IPv4 af_ops and (when MD5/AO is compiled in) the signing ops and the
 * destructor that frees their key material.  Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	sk->sk_destruct = tcp4_destruct_sock;
#endif

	return 0;
}
2479 
/* Drop the page_pool references stashed in sk->sk_user_frags.
 * No-op unless CONFIG_PAGE_POOL is enabled.  The WARN fires if a page was
 * not actually page_pool backed — TODO confirm against napi_pp_put_page().
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}
2490 
/* Tear down TCP state when a socket is destroyed: timers, congestion
 * control, ULP, queued skbs, bind bucket and fastopen state.  Shared with
 * IPv6 (see the export below).
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen child must have been disposed of before destruction. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
2529 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2530 
2531 #ifdef CONFIG_PROC_FS
2532 /* Proc filesystem TCP sock list dumping. */
2533 
2534 static unsigned short seq_file_family(const struct seq_file *seq);
2535 
2536 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2537 {
2538 	unsigned short family = seq_file_family(seq);
2539 
2540 	/* AF_UNSPEC is used as a match all */
2541 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2542 		net_eq(sock_net(sk), seq_file_net(seq)));
2543 }
2544 
2545 /* Find a non empty bucket (starting from st->bucket)
2546  * and return the first sk from it.
2547  */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* Skip empty buckets without taking their lock. */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		/* On a match we return with ilb2->lock still held; it is
		 * released by listening_get_next() / tcp_seq_stop().
		 */
		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2573 
2574 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2575  * If "cur" is the last one in the st->bucket,
2576  * call listening_get_first() to return the first sk of the next
2577  * non empty bucket.
2578  */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	/* Caller holds the current bucket's lock (see listening_get_first()). */
	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* Bucket exhausted: drop its lock and move on to the next one. */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
2602 
2603 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2604 {
2605 	struct tcp_iter_state *st = seq->private;
2606 	void *rc;
2607 
2608 	st->bucket = 0;
2609 	st->offset = 0;
2610 	rc = listening_get_first(seq);
2611 
2612 	while (rc && *pos) {
2613 		rc = listening_get_next(seq, rc);
2614 		--*pos;
2615 	}
2616 	return rc;
2617 }
2618 
/* Lockless check whether the current ehash bucket has no sockets. */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
2624 
2625 /*
2626  * Get first established socket starting from bucket given in st->bucket.
2627  * If st->bucket is zero, the very first socket in the hash is returned.
2628  */
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 * On a match, returns with the bucket lock held (BH disabled); released
 * by established_get_next() / tcp_seq_stop().
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		/* The hash can be huge; yield the CPU between buckets. */
		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
2656 
/* Advance to the next matching established socket; caller holds the
 * current bucket's lock, which is dropped when the bucket is exhausted.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* End of chain: release this bucket and continue with the next. */
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2678 
2679 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2680 {
2681 	struct tcp_iter_state *st = seq->private;
2682 	void *rc;
2683 
2684 	st->bucket = 0;
2685 	rc = established_get_first(seq);
2686 
2687 	while (rc && pos) {
2688 		rc = established_get_next(seq, rc);
2689 		--pos;
2690 	}
2691 	return rc;
2692 }
2693 
2694 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2695 {
2696 	void *rc;
2697 	struct tcp_iter_state *st = seq->private;
2698 
2699 	st->state = TCP_SEQ_STATE_LISTENING;
2700 	rc	  = listening_get_idx(seq, &pos);
2701 
2702 	if (!rc) {
2703 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2704 		rc	  = established_get_idx(seq, pos);
2705 	}
2706 
2707 	return rc;
2708 }
2709 
/* Fast resume for sequential reads: instead of re-walking from the start,
 * restart from the remembered bucket/offset.  If the bucket changed under
 * us the offset skip is abandoned (bucket == st->bucket check), and
 * st->num is restored since the helpers incremented it while skipping.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listeners exhausted: continue into the established hash. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2743 
/* seq_file ->start: try the fast resume path when the requested position
 * matches where the previous read stopped; otherwise do a full linear
 * seek.  Position 0 yields SEQ_START_TOKEN (the header line).
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	/* Slow path: restart the walk from scratch. */
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
2765 EXPORT_IPV6_MOD(tcp_seq_start);
2766 
/* seq_file ->next: advance one socket, transparently crossing from the
 * listening table to the established table when the former runs out.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* No more listeners: switch to the established hash. */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
2796 EXPORT_IPV6_MOD(tcp_seq_next);
2797 
/* seq_file ->stop: release whichever bucket lock the iterator is still
 * holding (the get_first/get_next helpers return with it held while a
 * socket is in flight).
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
2814 EXPORT_IPV6_MOD(tcp_seq_stop);
2815 
/* Print one SYN_RECV request socket as a /proc/net/tcp row.  The column
 * layout must stay in lockstep with get_tcp4_sock()/get_timewait4_sock().
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2841 
/* Print one full TCP socket as a /proc/net/tcp row.  All fields are read
 * locklessly (READ_ONCE / smp_load_acquire), so transient inconsistencies
 * between columns are possible and tolerated.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Map the pending icsk timer onto the historical /proc "tr" codes:
	 * 1 = retransmit-like, 4 = zero window probe, 2 = keepalive, 0 = none.
	 */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active	= 2;
		timer_expires	= icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	/* For listeners "rx_queue" reports the accept backlog instead. */
	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2905 
/* Print one timewait socket as a /proc/net/tcp row.  Most columns are
 * meaningless for timewait sockets and printed as zero.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2924 
2925 #define TMPSZ 150
2926 
2927 static int tcp4_seq_show(struct seq_file *seq, void *v)
2928 {
2929 	struct tcp_iter_state *st;
2930 	struct sock *sk = v;
2931 
2932 	seq_setwidth(seq, TMPSZ - 1);
2933 	if (v == SEQ_START_TOKEN) {
2934 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2935 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2936 			   "inode");
2937 		goto out;
2938 	}
2939 	st = seq->private;
2940 
2941 	if (sk->sk_state == TCP_TIME_WAIT)
2942 		get_timewait4_sock(v, seq, st->num);
2943 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2944 		get_openreq4(v, seq, st->num);
2945 	else
2946 		get_tcp4_sock(v, seq, st->num);
2947 out:
2948 	seq_pad(seq, '\n');
2949 	return 0;
2950 }
2951 
2952 #ifdef CONFIG_BPF_SYSCALL
/* A batch slot holds a referenced socket while the batch is live, or its
 * cookie once the references were dropped (see bpf_iter_tcp_put_batch()).
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};
2957 
/* BPF TCP iterator state: embeds the regular seq_file iterator state plus
 * the current batch of sockets (cur_sk..end_sk consumed range, max_sk
 * allocated capacity).
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	union bpf_tcp_iter_batch_item *batch;
};
2965 
/* Context passed to the attached BPF program for each visited socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2971 
/* Build the bpf_iter__tcp context and run the attached BPF program for one
 * socket, returning the program's verdict from bpf_iter_run_prog().
 */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
2983 
2984 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2985 {
2986 	union bpf_tcp_iter_batch_item *item;
2987 	unsigned int cur_sk = iter->cur_sk;
2988 	__u64 cookie;
2989 
2990 	/* Remember the cookies of the sockets we haven't seen yet, so we can
2991 	 * pick up where we left off next time around.
2992 	 */
2993 	while (cur_sk < iter->end_sk) {
2994 		item = &iter->batch[cur_sk++];
2995 		cookie = sock_gen_cookie(item->sk);
2996 		sock_gen_put(item->sk);
2997 		item->cookie = cookie;
2998 	}
2999 }
3000 
3001 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3002 				      unsigned int new_batch_sz, gfp_t flags)
3003 {
3004 	union bpf_tcp_iter_batch_item *new_batch;
3005 
3006 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3007 			     flags | __GFP_NOWARN);
3008 	if (!new_batch)
3009 		return -ENOMEM;
3010 
3011 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3012 	kvfree(iter->batch);
3013 	iter->batch = new_batch;
3014 	iter->max_sk = new_batch_sz;
3015 
3016 	return 0;
3017 }
3018 
/* Given the first socket of a bucket chain, find the first remembered
 * cookie that still resolves to a socket in the chain, i.e. the point
 * where the previous batch left off.  Returns NULL when none of the
 * remembered sockets remain (they may have been freed meanwhile).
 */
static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
					       union bpf_tcp_iter_batch_item *cookies,
					       int n_cookies)
{
	struct hlist_nulls_node *node;
	struct sock *sk;
	int i;

	/* Cookies are tried in batch order so the earliest survivor wins. */
	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		sk_nulls_for_each_from(sk, node)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				return sk;
	}

	return NULL;
}
3036 
/* Resume the listening-hash walk after the previous batch: re-find the
 * remembered position via cookies, or fall forward to the next non-empty
 * bucket if it vanished.  Returns with the bucket lock held when non-NULL
 * (via listening_get_first()).
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* Whole previous batch consumed: start with the next bucket. */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* All remembered sockets are gone: move on. */
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}
3066 
/* Established-hash counterpart of bpf_iter_tcp_resume_listening(): resume
 * from the remembered cookies, resetting the batch indices.  Returns the
 * socket to continue from with the ehash bucket lock held (see the
 * spin_unlock_bh() below), or NULL when the established hash is exhausted.
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* Every batched socket was already shown: start with the next bucket. */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	/* Still in the bucket we stopped in: skip the sockets already seen. */
	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* None of the remembered sockets is still hashed in
			 * this bucket; move on to the next one.
			 */
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}
3096 
3097 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3098 {
3099 	struct bpf_tcp_iter_state *iter = seq->private;
3100 	struct tcp_iter_state *st = &iter->state;
3101 	struct sock *sk = NULL;
3102 
3103 	switch (st->state) {
3104 	case TCP_SEQ_STATE_LISTENING:
3105 		sk = bpf_iter_tcp_resume_listening(seq);
3106 		if (sk)
3107 			break;
3108 		st->bucket = 0;
3109 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3110 		fallthrough;
3111 	case TCP_SEQ_STATE_ESTABLISHED:
3112 		sk = bpf_iter_tcp_resume_established(seq);
3113 		break;
3114 	}
3115 
3116 	return sk;
3117 }
3118 
3119 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3120 						 struct sock **start_sk)
3121 {
3122 	struct bpf_tcp_iter_state *iter = seq->private;
3123 	struct hlist_nulls_node *node;
3124 	unsigned int expected = 1;
3125 	struct sock *sk;
3126 
3127 	sock_hold(*start_sk);
3128 	iter->batch[iter->end_sk++].sk = *start_sk;
3129 
3130 	sk = sk_nulls_next(*start_sk);
3131 	*start_sk = NULL;
3132 	sk_nulls_for_each_from(sk, node) {
3133 		if (seq_sk_match(seq, sk)) {
3134 			if (iter->end_sk < iter->max_sk) {
3135 				sock_hold(sk);
3136 				iter->batch[iter->end_sk++].sk = sk;
3137 			} else if (!*start_sk) {
3138 				/* Remember where we left off. */
3139 				*start_sk = sk;
3140 			}
3141 			expected++;
3142 		}
3143 	}
3144 
3145 	return expected;
3146 }
3147 
3148 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3149 						   struct sock **start_sk)
3150 {
3151 	struct bpf_tcp_iter_state *iter = seq->private;
3152 	struct hlist_nulls_node *node;
3153 	unsigned int expected = 1;
3154 	struct sock *sk;
3155 
3156 	sock_hold(*start_sk);
3157 	iter->batch[iter->end_sk++].sk = *start_sk;
3158 
3159 	sk = sk_nulls_next(*start_sk);
3160 	*start_sk = NULL;
3161 	sk_nulls_for_each_from(sk, node) {
3162 		if (seq_sk_match(seq, sk)) {
3163 			if (iter->end_sk < iter->max_sk) {
3164 				sock_hold(sk);
3165 				iter->batch[iter->end_sk++].sk = sk;
3166 			} else if (!*start_sk) {
3167 				/* Remember where we left off. */
3168 				*start_sk = sk;
3169 			}
3170 			expected++;
3171 		}
3172 	}
3173 
3174 	return expected;
3175 }
3176 
3177 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3178 					struct sock **start_sk)
3179 {
3180 	struct bpf_tcp_iter_state *iter = seq->private;
3181 	struct tcp_iter_state *st = &iter->state;
3182 
3183 	if (st->state == TCP_SEQ_STATE_LISTENING)
3184 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3185 	else
3186 		return bpf_iter_tcp_established_batch(seq, start_sk);
3187 }
3188 
3189 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3190 {
3191 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3192 	struct bpf_tcp_iter_state *iter = seq->private;
3193 	struct tcp_iter_state *st = &iter->state;
3194 
3195 	if (st->state == TCP_SEQ_STATE_LISTENING)
3196 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3197 	else
3198 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3199 }
3200 
/* Fill iter->batch with the next bucket's matching sockets, each with a
 * reference held.  Returns the first socket of the new batch, NULL when
 * both hash tables are exhausted, or ERR_PTR(-ENOMEM) if the batch array
 * could not be grown.  The bucket lock is always dropped before returning.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	/* expected = matching sockets in the bucket; iter->end_sk = how
	 * many actually fit in the batch.
	 */
	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	/* The bucket may have changed while unlocked; resume and refill. */
	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	/* Lock was held throughout, so the bucket cannot have grown. */
	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3248 
3249 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3250 {
3251 	/* bpf iter does not support lseek, so it always
3252 	 * continue from where it was stop()-ped.
3253 	 */
3254 	if (*pos)
3255 		return bpf_iter_tcp_batch(seq);
3256 
3257 	return SEQ_START_TOKEN;
3258 }
3259 
/* Advance the walk: release the socket just shown, then serve the next
 * batched socket or batch the next bucket once the batch is drained.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
3291 
3292 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3293 {
3294 	struct bpf_iter_meta meta;
3295 	struct bpf_prog *prog;
3296 	struct sock *sk = v;
3297 	uid_t uid;
3298 	int ret;
3299 
3300 	if (v == SEQ_START_TOKEN)
3301 		return 0;
3302 
3303 	if (sk_fullsock(sk))
3304 		lock_sock(sk);
3305 
3306 	if (unlikely(sk_unhashed(sk))) {
3307 		ret = SEQ_SKIP;
3308 		goto unlock;
3309 	}
3310 
3311 	if (sk->sk_state == TCP_TIME_WAIT) {
3312 		uid = 0;
3313 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3314 		const struct request_sock *req = v;
3315 
3316 		uid = from_kuid_munged(seq_user_ns(seq),
3317 				       sk_uid(req->rsk_listener));
3318 	} else {
3319 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3320 	}
3321 
3322 	meta.seq = seq;
3323 	prog = bpf_iter_get_info(&meta, false);
3324 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3325 
3326 unlock:
3327 	if (sk_fullsock(sk))
3328 		release_sock(sk);
3329 	return ret;
3330 
3331 }
3332 
3333 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3334 {
3335 	struct bpf_tcp_iter_state *iter = seq->private;
3336 	struct bpf_iter_meta meta;
3337 	struct bpf_prog *prog;
3338 
3339 	if (!v) {
3340 		meta.seq = seq;
3341 		prog = bpf_iter_get_info(&meta, true);
3342 		if (prog)
3343 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3344 	}
3345 
3346 	if (iter->cur_sk < iter->end_sk)
3347 		bpf_iter_tcp_put_batch(iter);
3348 }
3349 
3350 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3351 	.show		= bpf_iter_tcp_seq_show,
3352 	.start		= bpf_iter_tcp_seq_start,
3353 	.next		= bpf_iter_tcp_seq_next,
3354 	.stop		= bpf_iter_tcp_seq_stop,
3355 };
3356 #endif
/* Address family the current seq walk should show.  AF_UNSPEC (returned
 * for the bpf iterator) means no kernel-side filtering; the bpf prog
 * filters on its own.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
3371 
3372 static const struct seq_operations tcp4_seq_ops = {
3373 	.show		= tcp4_seq_show,
3374 	.start		= tcp_seq_start,
3375 	.next		= tcp_seq_next,
3376 	.stop		= tcp_seq_stop,
3377 };
3378 
/* Restricts the /proc/net/tcp walk to IPv4 sockets (see seq_file_family()). */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3382 
3383 static int __net_init tcp4_proc_init_net(struct net *net)
3384 {
3385 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3386 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3387 		return -ENOMEM;
3388 	return 0;
3389 }
3390 
/* Remove the per-netns /proc/net/tcp entry. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3395 
/* /proc/net/tcp lifetime follows each network namespace. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3400 
/* Register the /proc/net/tcp pernet ops at boot. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3405 
/* Unregister the /proc/net/tcp pernet ops. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3410 #endif /* CONFIG_PROC_FS */
3411 
/* The TCP/IPv4 protocol operations table, wired up when an AF_INET
 * SOCK_STREAM socket is created.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	/* Socket lifecycle and user-visible operations. */
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	/* Hash table and port management. */
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory accounting and pressure handling. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,

	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.freeptr_offset		= offsetof(struct tcp_sock,
					   inet_conn.icsk_inet.sk.sk_freeptr),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3464 
/* Per-netns exit: drop the ref taken on the congestion control module
 * in tcp_sk_init().
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
3471 
/* Pick the ehash table for a new netns: a private one sized by the parent
 * netns' sysctl_tcp_child_ehash_entries, or the global tcp_hashinfo as a
 * fallback.  max_tw_buckets and max_syn_backlog are derived from the
 * chosen table's size.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* The netns of the process creating this one supplies the size. */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3501 
/* Per-netns TCP init: set every ipv4.sysctl_tcp_* default, choose the
 * hash tables, and inherit rmem/wmem and congestion control from
 * init_net.  Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	/* ECN / AccECN defaults. */
	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	/* Path MTU discovery / probing. */
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	/* Keepalive defaults. */
	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	/* Retry limits and connection teardown. */
	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 4 MB */
	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit init_net's current rmem/wmem defaults. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;

	return 0;
}
3609 
/* Batched netns teardown: purge timewait sockets and free any per-netns
 * ehash tables for every netns on net_exit_list.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and failed setup_net error unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, the thread that dismantles a particular twsk must
	 * do so without other thread progressing to refcount_dec_and_test() of
	 * tcp_death_row.tw_refcount.
	 */
	mutex_lock(&tcp_exit_batch_mutex);

	tcp_twsk_purge(net_exit_list);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		/* The ref set in tcp_sk_init() must be the last one left. */
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}

	mutex_unlock(&tcp_exit_batch_mutex);
}
3634 
3635 static struct pernet_operations __net_initdata tcp_sk_ops = {
3636        .init	   = tcp_sk_init,
3637        .exit	   = tcp_sk_exit,
3638        .exit_batch = tcp_sk_exit_batch,
3639 };
3640 
3641 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf iterator target "tcp" with this context signature. */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial batch capacity; grown on demand by bpf_iter_tcp_realloc_batch(). */
#define INIT_BATCH_SZ 16
3646 
3647 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3648 {
3649 	struct bpf_tcp_iter_state *iter = priv_data;
3650 	int err;
3651 
3652 	err = bpf_iter_init_seq_net(priv_data, aux);
3653 	if (err)
3654 		return err;
3655 
3656 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3657 	if (err) {
3658 		bpf_iter_fini_seq_net(priv_data);
3659 		return err;
3660 	}
3661 
3662 	return 0;
3663 }
3664 
/* Tear down the per-seq state set up by bpf_iter_init_tcp(). */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3672 
/* seq_file glue for the bpf "tcp" iterator target. */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3679 
3680 static const struct bpf_func_proto *
3681 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3682 			    const struct bpf_prog *prog)
3683 {
3684 	switch (func_id) {
3685 	case BPF_FUNC_setsockopt:
3686 		return &bpf_sk_setsockopt_proto;
3687 	case BPF_FUNC_getsockopt:
3688 		return &bpf_sk_getsockopt_proto;
3689 	default:
3690 		return NULL;
3691 	}
3692 }
3693 
/* Registration record for the bpf "tcp" iterator target.  The sk_common
 * ctx arg's btf_id is filled in at boot by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3704 
/* Register the "tcp" bpf iterator target; called once from tcp_v4_init(). */
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
3711 
3712 #endif
3713 
/* Boot-time TCP/IPv4 init: create the per-cpu control sockets (used to
 * send RSTs/ACKs), register the pernet ops, and — when configured — the
 * bpf "tcp" iterator target.  Panics on failure, as TCP is not optional.
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		sk->sk_clockid = CLOCK_MONOTONIC;

		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
3743