xref: /linux/net/ipv4/tcp_ipv4.c (revision af2d6148d2a159e1a0862bce5a2c88c1618a2b27)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/aligned_data.h>
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/inet_ecn.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/secure_seq.h>
74 #include <net/busy_poll.h>
75 #include <net/rstreason.h>
76 
77 #include <linux/inet.h>
78 #include <linux/ipv6.h>
79 #include <linux/stddef.h>
80 #include <linux/proc_fs.h>
81 #include <linux/seq_file.h>
82 #include <linux/inetdevice.h>
83 #include <linux/btf_ids.h>
84 #include <linux/skbuff_ref.h>
85 
86 #include <crypto/hash.h>
87 #include <linux/scatterlist.h>
88 
89 #include <trace/events/tcp.h>
90 
91 #ifdef CONFIG_TCP_MD5SIG
92 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
93 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
94 #endif
95 
96 struct inet_hashinfo tcp_hashinfo;
97 
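/* Per-CPU control sockets, used by tcp_v4_send_reset() and tcp_v4_send_ack()
 * below to transmit replies that are not tied to a full socket.  The nested
 * BH lock serializes use of the per-CPU socket; the sockets themselves are
 * created at init time, outside this excerpt.
 */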
98 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
99 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
100 };
101 
102 static DEFINE_MUTEX(tcp_exit_batch_mutex);
103 
104 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
105 {
106 	return secure_tcp_seq(ip_hdr(skb)->daddr,
107 			      ip_hdr(skb)->saddr,
108 			      tcp_hdr(skb)->dest,
109 			      tcp_hdr(skb)->source);
110 }
111 
112 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
113 {
114 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
115 }
116 
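/* Decide whether TIME-WAIT socket @sktw may be reused for a new outgoing
 * connection on the same 4-tuple.  A non-zero return means the caller may
 * recycle the tuple, and a reference on @sktw has already been taken here.
 * Roughly, a sketch of the condition checked below (not an extra check):
 *
 *	reusable = tw_ts_recent_stamp &&
 *		   (!twp || (tw_reuse allowed &&
 *			     now_ms > tw_entry_stamp + tcp_tw_reuse_delay));
 */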
117 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
118 {
119 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
120 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
121 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
122 	struct tcp_sock *tp = tcp_sk(sk);
123 	int ts_recent_stamp;
124 	u32 reuse_thresh;
125 
126 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
127 		reuse = 0;
128 
129 	if (reuse == 2) {
130 		/* Still does not detect *everything* that goes through
131 		 * lo, since we require a loopback src or dst address
132 		 * or direct binding to 'lo' interface.
133 		 */
134 		bool loopback = false;
135 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
136 			loopback = true;
137 #if IS_ENABLED(CONFIG_IPV6)
138 		if (tw->tw_family == AF_INET6) {
139 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
140 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
141 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
142 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
143 				loopback = true;
144 		} else
145 #endif
146 		{
147 			if (ipv4_is_loopback(tw->tw_daddr) ||
148 			    ipv4_is_loopback(tw->tw_rcv_saddr))
149 				loopback = true;
150 		}
151 		if (!loopback)
152 			reuse = 0;
153 	}
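	/* tcp_tw_reuse == 2 thus means "reuse for loopback only"; with
	 * tcp_tw_reuse == 1 the loopback check above is skipped entirely,
	 * and 0 disables reuse altogether (see
	 * Documentation/networking/ip-sysctl.rst).
	 */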
154 
155 	/* With PAWS, it is safe from the viewpoint
156 	   of data integrity. Even without PAWS it is safe provided sequence
157 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
158 
159 	   Actually, the idea is close to VJ's: only the timestamp cache is
160 	   held not per host but per port pair, and the TW bucket is used as
161 	   the state holder.
162 
163 	   If the TW bucket has already been destroyed we fall back to VJ's
164 	   scheme and use the initial timestamp retrieved from the peer table.
165 	 */
166 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
167 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
168 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
169 	if (ts_recent_stamp &&
170 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
171 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
172 		 * and releasing the bucket lock.
173 		 */
174 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
175 			return 0;
176 
177 		/* In case of repair and re-using TIME-WAIT sockets we still
178 		 * want to be sure that it is safe as above but honor the
179 		 * sequence numbers and time stamps set as part of the repair
180 		 * process.
181 		 *
182 		 * Without this check re-using a TIME-WAIT socket with TCP
183 		 * repair would accumulate a -1 on the repair assigned
184 		 * sequence number. The first time it is reused the sequence
185 		 * is -1, the second time -2, etc. This fixes that issue
186 		 * without appearing to create any others.
187 		 */
188 		if (likely(!tp->repair)) {
189 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
190 
191 			if (!seq)
192 				seq = 1;
193 			WRITE_ONCE(tp->write_seq, seq);
194 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
195 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
196 		}
197 
198 		return 1;
199 	}
200 
201 	return 0;
202 }
203 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
204 
205 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
206 			      int addr_len)
207 {
208 	/* This check is replicated from tcp_v4_connect() and intended to
209 	 * prevent BPF program called below from accessing bytes that are out
210 	 * of the bound specified by user in addr_len.
211 	 */
212 	if (addr_len < sizeof(struct sockaddr_in))
213 		return -EINVAL;
214 
215 	sock_owned_by_me(sk);
216 
217 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
218 }
219 
220 /* This will initiate an outgoing connection. */
221 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
222 {
223 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
224 	struct inet_timewait_death_row *tcp_death_row;
225 	struct inet_sock *inet = inet_sk(sk);
226 	struct tcp_sock *tp = tcp_sk(sk);
227 	struct ip_options_rcu *inet_opt;
228 	struct net *net = sock_net(sk);
229 	__be16 orig_sport, orig_dport;
230 	__be32 daddr, nexthop;
231 	struct flowi4 *fl4;
232 	struct rtable *rt;
233 	int err;
234 
235 	if (addr_len < sizeof(struct sockaddr_in))
236 		return -EINVAL;
237 
238 	if (usin->sin_family != AF_INET)
239 		return -EAFNOSUPPORT;
240 
241 	nexthop = daddr = usin->sin_addr.s_addr;
242 	inet_opt = rcu_dereference_protected(inet->inet_opt,
243 					     lockdep_sock_is_held(sk));
244 	if (inet_opt && inet_opt->opt.srr) {
245 		if (!daddr)
246 			return -EINVAL;
247 		nexthop = inet_opt->opt.faddr;
248 	}
249 
250 	orig_sport = inet->inet_sport;
251 	orig_dport = usin->sin_port;
252 	fl4 = &inet->cork.fl.u.ip4;
253 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
254 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
255 			      orig_dport, sk);
256 	if (IS_ERR(rt)) {
257 		err = PTR_ERR(rt);
258 		if (err == -ENETUNREACH)
259 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
260 		return err;
261 	}
262 
263 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
264 		ip_rt_put(rt);
265 		return -ENETUNREACH;
266 	}
267 
268 	if (!inet_opt || !inet_opt->opt.srr)
269 		daddr = fl4->daddr;
270 
271 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
272 
273 	if (!inet->inet_saddr) {
274 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
275 		if (err) {
276 			ip_rt_put(rt);
277 			return err;
278 		}
279 	} else {
280 		sk_rcv_saddr_set(sk, inet->inet_saddr);
281 	}
282 
283 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
284 		/* Reset inherited state */
285 		tp->rx_opt.ts_recent	   = 0;
286 		tp->rx_opt.ts_recent_stamp = 0;
287 		if (likely(!tp->repair))
288 			WRITE_ONCE(tp->write_seq, 0);
289 	}
290 
291 	inet->inet_dport = usin->sin_port;
292 	sk_daddr_set(sk, daddr);
293 
294 	inet_csk(sk)->icsk_ext_hdr_len = 0;
295 	if (inet_opt)
296 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
297 
298 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
299 
300 	/* Socket identity is still unknown (sport may be zero).
301 	 * However we set state to SYN-SENT and, without releasing the socket
302 	 * lock, select a source port, enter ourselves into the hash tables and
303 	 * complete initialization after this.
304 	 */
305 	tcp_set_state(sk, TCP_SYN_SENT);
306 	err = inet_hash_connect(tcp_death_row, sk);
307 	if (err)
308 		goto failure;
309 
310 	sk_set_txhash(sk);
311 
312 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
313 			       inet->inet_sport, inet->inet_dport, sk);
314 	if (IS_ERR(rt)) {
315 		err = PTR_ERR(rt);
316 		rt = NULL;
317 		goto failure;
318 	}
319 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
320 	/* OK, now commit destination to socket.  */
321 	sk->sk_gso_type = SKB_GSO_TCPV4;
322 	sk_setup_caps(sk, &rt->dst);
323 	rt = NULL;
324 
325 	if (likely(!tp->repair)) {
326 		if (!tp->write_seq)
327 			WRITE_ONCE(tp->write_seq,
328 				   secure_tcp_seq(inet->inet_saddr,
329 						  inet->inet_daddr,
330 						  inet->inet_sport,
331 						  usin->sin_port));
332 		WRITE_ONCE(tp->tsoffset,
333 			   secure_tcp_ts_off(net, inet->inet_saddr,
334 					     inet->inet_daddr));
335 	}
336 
337 	atomic_set(&inet->inet_id, get_random_u16());
338 
339 	if (tcp_fastopen_defer_connect(sk, &err))
340 		return err;
341 	if (err)
342 		goto failure;
343 
344 	err = tcp_connect(sk);
345 
346 	if (err)
347 		goto failure;
348 
349 	return 0;
350 
351 failure:
352 	/*
353 	 * This unhashes the socket and releases the local port,
354 	 * if necessary.
355 	 */
356 	tcp_set_state(sk, TCP_CLOSE);
357 	inet_bhash2_reset_saddr(sk);
358 	ip_rt_put(rt);
359 	sk->sk_route_caps = 0;
360 	inet->inet_dport = 0;
361 	return err;
362 }
363 EXPORT_IPV6_MOD(tcp_v4_connect);
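/* A userspace sketch of how tcp_v4_connect() is reached (illustration only,
 * not part of this file): a plain blocking connect() such as
 *
 *	struct sockaddr_in dst = { .sin_family = AF_INET };
 *
 *	dst.sin_port = htons(80);
 *	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * ends up here via inet_stream_connect(); an addr_len smaller than
 * sizeof(struct sockaddr_in) yields -EINVAL and a family other than AF_INET
 * yields -EAFNOSUPPORT, matching the first two checks in the function.
 */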
364 
365 /*
366  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
367  * It can be called through tcp_release_cb() if socket was owned by user
368  * at the time tcp_v4_err() was called to handle ICMP message.
369  */
370 void tcp_v4_mtu_reduced(struct sock *sk)
371 {
372 	struct inet_sock *inet = inet_sk(sk);
373 	struct dst_entry *dst;
374 	u32 mtu;
375 
376 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
377 		return;
378 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
379 	dst = inet_csk_update_pmtu(sk, mtu);
380 	if (!dst)
381 		return;
382 
383 	/* Something is about to go wrong... Remember the soft error
384 	 * in case this connection will not be able to recover.
385 	 */
386 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
387 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
388 
389 	mtu = dst_mtu(dst);
390 
391 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
392 	    ip_sk_accept_pmtu(sk) &&
393 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
394 		tcp_sync_mss(sk, mtu);
395 
396 		/* Resend the TCP packet because it's
397 		 * clear that the old packet has been
398 		 * dropped. This is the new "fast" path mtu
399 		 * discovery.
400 		 */
401 		tcp_simple_retransmit(sk);
402 	} /* else let the usual retransmit timer handle it */
403 }
404 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
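/* The mtu_info value read above is written by tcp_v4_err() on
 * ICMP_FRAG_NEEDED; when the socket was owned by the user at that time, the
 * work is deferred through the TCP_MTU_REDUCED_DEFERRED flag and picked up
 * later from tcp_release_cb(), as the comment above the function notes.
 */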
405 
406 static void do_redirect(struct sk_buff *skb, struct sock *sk)
407 {
408 	struct dst_entry *dst = __sk_dst_check(sk, 0);
409 
410 	if (dst)
411 		dst->ops->redirect(dst, sk, skb);
412 }
413 
414 
415 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
416 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
417 {
418 	struct request_sock *req = inet_reqsk(sk);
419 	struct net *net = sock_net(sk);
420 
421 	/* ICMPs are not backlogged, hence we cannot get
422 	 * an established socket here.
423 	 */
424 	if (seq != tcp_rsk(req)->snt_isn) {
425 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
426 	} else if (abort) {
427 		/*
428 		 * Still in SYN_RECV, just remove it silently.
429 		 * There is no good way to pass the error to the newly
430 		 * created socket, and POSIX does not want network
431 		 * errors returned from accept().
432 		 */
433 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
434 		tcp_listendrop(req->rsk_listener);
435 	}
436 	reqsk_put(req);
437 }
438 EXPORT_IPV6_MOD(tcp_req_err);
439 
440 /* TCP-LD (RFC 6069) logic */
441 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
442 {
443 	struct inet_connection_sock *icsk = inet_csk(sk);
444 	struct tcp_sock *tp = tcp_sk(sk);
445 	struct sk_buff *skb;
446 	s32 remaining;
447 	u32 delta_us;
448 
449 	if (sock_owned_by_user(sk))
450 		return;
451 
452 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
453 	    !icsk->icsk_backoff)
454 		return;
455 
456 	skb = tcp_rtx_queue_head(sk);
457 	if (WARN_ON_ONCE(!skb))
458 		return;
459 
460 	icsk->icsk_backoff--;
461 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
462 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
463 
464 	tcp_mstamp_refresh(tp);
465 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
466 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
467 
468 	if (remaining > 0) {
469 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
470 	} else {
471 		/* RTO revert clocked out retransmission.
472 		 * Will retransmit now.
473 		 */
474 		tcp_retransmit_timer(sk);
475 	}
476 }
477 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
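/* The revert above follows the RFC 6069 idea: an ICMP unreachable matching
 * the earliest outstanding segment suggests the problem is in the network
 * rather than at the peer, so one exponential-backoff step is undone and the
 * retransmission timer is re-armed, or fired immediately if it has already
 * expired.
 */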
478 
479 /*
480  * This routine is called by the ICMP module when it gets some
481  * sort of error condition.  If err < 0 then the socket should
482  * be closed and the error returned to the user.  If err > 0
483  * it's just the icmp type << 8 | icmp code.  After adjustment
484  * header points to the first 8 bytes of the tcp header.  We need
485  * to find the appropriate port.
486  *
487  * The locking strategy used here is very "optimistic". When
488  * someone else accesses the socket the ICMP is just dropped
489  * and for some paths there is no check at all.
490  * A more general error queue to queue errors for later handling
491  * is probably better.
492  *
493  */
494 
495 int tcp_v4_err(struct sk_buff *skb, u32 info)
496 {
497 	const struct iphdr *iph = (const struct iphdr *)skb->data;
498 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
499 	struct net *net = dev_net_rcu(skb->dev);
500 	const int type = icmp_hdr(skb)->type;
501 	const int code = icmp_hdr(skb)->code;
502 	struct request_sock *fastopen;
503 	struct tcp_sock *tp;
504 	u32 seq, snd_una;
505 	struct sock *sk;
506 	int err;
507 
508 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
509 				       iph->daddr, th->dest, iph->saddr,
510 				       ntohs(th->source), inet_iif(skb), 0);
511 	if (!sk) {
512 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
513 		return -ENOENT;
514 	}
515 	if (sk->sk_state == TCP_TIME_WAIT) {
516 		/* To increase the counter of ignored icmps for TCP-AO */
517 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
518 		inet_twsk_put(inet_twsk(sk));
519 		return 0;
520 	}
521 	seq = ntohl(th->seq);
522 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
523 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
524 				     type == ICMP_TIME_EXCEEDED ||
525 				     (type == ICMP_DEST_UNREACH &&
526 				      (code == ICMP_NET_UNREACH ||
527 				       code == ICMP_HOST_UNREACH)));
528 		return 0;
529 	}
530 
531 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
532 		sock_put(sk);
533 		return 0;
534 	}
535 
536 	bh_lock_sock(sk);
537 	/* If too many ICMPs get dropped on busy
538 	 * servers this needs to be solved differently.
539 	 * We do take care of PMTU discovery (RFC1191) special case :
540 	 * we can receive locally generated ICMP messages while socket is held.
541 	 */
542 	if (sock_owned_by_user(sk)) {
543 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
544 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
545 	}
546 	if (sk->sk_state == TCP_CLOSE)
547 		goto out;
548 
549 	if (static_branch_unlikely(&ip4_min_ttl)) {
550 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
551 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
552 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
553 			goto out;
554 		}
555 	}
556 
557 	tp = tcp_sk(sk);
558 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
559 	fastopen = rcu_dereference(tp->fastopen_rsk);
560 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
561 	if (sk->sk_state != TCP_LISTEN &&
562 	    !between(seq, snd_una, tp->snd_nxt)) {
563 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
564 		goto out;
565 	}
566 
567 	switch (type) {
568 	case ICMP_REDIRECT:
569 		if (!sock_owned_by_user(sk))
570 			do_redirect(skb, sk);
571 		goto out;
572 	case ICMP_SOURCE_QUENCH:
573 		/* Just silently ignore these. */
574 		goto out;
575 	case ICMP_PARAMETERPROB:
576 		err = EPROTO;
577 		break;
578 	case ICMP_DEST_UNREACH:
579 		if (code > NR_ICMP_UNREACH)
580 			goto out;
581 
582 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
583 			/* We are not interested in TCP_LISTEN and open_requests
584 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
585 			 * they should go through unfragmented).
586 			 */
587 			if (sk->sk_state == TCP_LISTEN)
588 				goto out;
589 
590 			WRITE_ONCE(tp->mtu_info, info);
591 			if (!sock_owned_by_user(sk)) {
592 				tcp_v4_mtu_reduced(sk);
593 			} else {
594 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
595 					sock_hold(sk);
596 			}
597 			goto out;
598 		}
599 
600 		err = icmp_err_convert[code].errno;
601 		/* check if this ICMP message allows revert of backoff.
602 		 * (see RFC 6069)
603 		 */
604 		if (!fastopen &&
605 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
606 			tcp_ld_RTO_revert(sk, seq);
607 		break;
608 	case ICMP_TIME_EXCEEDED:
609 		err = EHOSTUNREACH;
610 		break;
611 	default:
612 		goto out;
613 	}
614 
615 	switch (sk->sk_state) {
616 	case TCP_SYN_SENT:
617 	case TCP_SYN_RECV:
618 		/* Only in fast or simultaneous open. If a fast open socket is
619 		 * already accepted it is treated as a connected one below.
620 		 */
621 		if (fastopen && !fastopen->sk)
622 			break;
623 
624 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
625 
626 		if (!sock_owned_by_user(sk))
627 			tcp_done_with_error(sk, err);
628 		else
629 			WRITE_ONCE(sk->sk_err_soft, err);
630 		goto out;
631 	}
632 
633 	/* If we've already connected we will keep trying
634 	 * until we time out, or the user gives up.
635 	 *
636 	 * RFC 1122 4.2.3.9 allows considering only PROTO_UNREACH and
637 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
638 	 * obsoleted by PMTU discovery).
639 	 *
640 	 * Note that in the modern internet, where routing is unreliable and
641 	 * broken firewalls sit in every dark corner sending random errors
642 	 * ordered by their masters, even these two messages have finally lost
643 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
644 	 *
645 	 * Now we are in compliance with RFCs.
646 	 *							--ANK (980905)
647 	 */
648 
649 	if (!sock_owned_by_user(sk) &&
650 	    inet_test_bit(RECVERR, sk)) {
651 		WRITE_ONCE(sk->sk_err, err);
652 		sk_error_report(sk);
653 	} else	{ /* Only an error on timeout */
654 		WRITE_ONCE(sk->sk_err_soft, err);
655 	}
656 
657 out:
658 	bh_unlock_sock(sk);
659 	sock_put(sk);
660 	return 0;
661 }
662 
663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
664 {
665 	struct tcphdr *th = tcp_hdr(skb);
666 
667 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
668 	skb->csum_start = skb_transport_header(skb) - skb->head;
669 	skb->csum_offset = offsetof(struct tcphdr, check);
670 }
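/* Only the pseudo-header sum is filled in here; csum_start/csum_offset tell
 * the NIC (or the software fallback) where to fold the final checksum, the
 * usual CHECKSUM_PARTIAL arrangement.
 */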
671 
672 /* This routine computes an IPv4 TCP checksum. */
673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
674 {
675 	const struct inet_sock *inet = inet_sk(sk);
676 
677 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
678 }
679 EXPORT_IPV6_MOD(tcp_v4_send_check);
680 
681 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
682 
683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
684 				 const struct tcp_ao_hdr *aoh,
685 				 struct ip_reply_arg *arg, struct tcphdr *reply,
686 				 __be32 reply_options[REPLY_OPTIONS_LEN])
687 {
688 #ifdef CONFIG_TCP_AO
689 	int sdif = tcp_v4_sdif(skb);
690 	int dif = inet_iif(skb);
691 	int l3index = sdif ? dif : 0;
692 	bool allocated_traffic_key;
693 	struct tcp_ao_key *key;
694 	char *traffic_key;
695 	bool drop = true;
696 	u32 ao_sne = 0;
697 	u8 keyid;
698 
699 	rcu_read_lock();
700 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
701 				 &key, &traffic_key, &allocated_traffic_key,
702 				 &keyid, &ao_sne))
703 		goto out;
704 
705 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
706 				 (aoh->rnext_keyid << 8) | keyid);
707 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
708 	reply->doff = arg->iov[0].iov_len / 4;
709 
710 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
711 			    key, traffic_key,
712 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
714 			    reply, ao_sne))
715 		goto out;
716 	drop = false;
717 out:
718 	rcu_read_unlock();
719 	if (allocated_traffic_key)
720 		kfree(traffic_key);
721 	return drop;
722 #else
723 	return true;
724 #endif
725 }
726 
727 /*
728  *	This routine will send an RST to the other tcp.
729  *
730  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
731  *		      for the reset.
732  *	Answer: if a packet caused the RST, it is not for a socket
733  *		existing in our system; if it is matched to a socket,
734  *		it is just a duplicate segment or a bug in the other side's TCP.
735  *		So we build the reply based only on the parameters
736  *		that arrived with the segment.
737  *	Exception: precedence violation. We do not implement it in any case.
738  */
739 
740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
741 			      enum sk_rst_reason reason)
742 {
743 	const struct tcphdr *th = tcp_hdr(skb);
744 	struct {
745 		struct tcphdr th;
746 		__be32 opt[REPLY_OPTIONS_LEN];
747 	} rep;
748 	const __u8 *md5_hash_location = NULL;
749 	const struct tcp_ao_hdr *aoh;
750 	struct ip_reply_arg arg;
751 #ifdef CONFIG_TCP_MD5SIG
752 	struct tcp_md5sig_key *key = NULL;
753 	unsigned char newhash[16];
754 	struct sock *sk1 = NULL;
755 	int genhash;
756 #endif
757 	u64 transmit_time = 0;
758 	struct sock *ctl_sk;
759 	struct net *net;
760 	u32 txhash = 0;
761 
762 	/* Never send a reset in response to a reset. */
763 	if (th->rst)
764 		return;
765 
766 	/* If sk is not NULL, it means we did a successful lookup and the
767 	 * incoming route had to be correct. prequeue might have dropped our dst.
768 	 */
769 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
770 		return;
771 
772 	/* Swap the send and the receive. */
773 	memset(&rep, 0, sizeof(rep));
774 	rep.th.dest   = th->source;
775 	rep.th.source = th->dest;
776 	rep.th.doff   = sizeof(struct tcphdr) / 4;
777 	rep.th.rst    = 1;
778 
779 	if (th->ack) {
780 		rep.th.seq = th->ack_seq;
781 	} else {
782 		rep.th.ack = 1;
783 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
784 				       skb->len - (th->doff << 2));
785 	}
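	/* With no ACK in the offending segment, the RST acks exactly the
	 * sequence space that segment consumed: its payload length plus one
	 * each for SYN and FIN, per the RFC 793 reset generation rules.
	 */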
786 
787 	memset(&arg, 0, sizeof(arg));
788 	arg.iov[0].iov_base = (unsigned char *)&rep;
789 	arg.iov[0].iov_len  = sizeof(rep.th);
790 
791 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
792 
793 	/* Invalid TCP option size or twice included auth */
794 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
795 		return;
796 
797 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
798 		return;
799 
800 #ifdef CONFIG_TCP_MD5SIG
801 	rcu_read_lock();
802 	if (sk && sk_fullsock(sk)) {
803 		const union tcp_md5_addr *addr;
804 		int l3index;
805 
806 		/* sdif set, means packet ingressed via a device
807 		 * in an L3 domain and inet_iif is set to it.
808 		 */
809 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
810 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
811 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
812 	} else if (md5_hash_location) {
813 		const union tcp_md5_addr *addr;
814 		int sdif = tcp_v4_sdif(skb);
815 		int dif = inet_iif(skb);
816 		int l3index;
817 
818 		/*
819 		 * The active side is lost. Try to find the listening socket
820 		 * through the source port, and then find the md5 key through
821 		 * the listening socket. We do not loosen security here:
822 		 * the incoming packet is checked with the md5 hash of the key
823 		 * we find, and no RST is generated if the hash doesn't match.
824 		 */
825 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
826 					     NULL, 0, ip_hdr(skb)->saddr,
827 					     th->source, ip_hdr(skb)->daddr,
828 					     ntohs(th->source), dif, sdif);
829 		/* don't send rst if it can't find key */
830 		if (!sk1)
831 			goto out;
832 
833 		/* sdif set, means packet ingressed via a device
834 		 * in an L3 domain and dif is set to it.
835 		 */
836 		l3index = sdif ? dif : 0;
837 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
838 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
839 		if (!key)
840 			goto out;
841 
842 
843 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
844 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
845 			goto out;
846 
847 	}
848 
849 	if (key) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
851 				   (TCPOPT_NOP << 16) |
852 				   (TCPOPT_MD5SIG << 8) |
853 				   TCPOLEN_MD5SIG);
854 		/* Update length and the length the header thinks exists */
855 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 		rep.th.doff = arg.iov[0].iov_len / 4;
857 
858 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
859 				     key, ip_hdr(skb)->saddr,
860 				     ip_hdr(skb)->daddr, &rep.th);
861 	}
862 #endif
863 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
864 	if (rep.opt[0] == 0) {
865 		__be32 mrst = mptcp_reset_option(skb);
866 
867 		if (mrst) {
868 			rep.opt[0] = mrst;
869 			arg.iov[0].iov_len += sizeof(mrst);
870 			rep.th.doff = arg.iov[0].iov_len / 4;
871 		}
872 	}
873 
874 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
875 				      ip_hdr(skb)->saddr, /* XXX */
876 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
877 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
878 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
879 
880 	/* When socket is gone, all binding information is lost.
881 	 * Routing might fail in this case. No choice here: if we choose to force
882 	 * the input interface, we will misroute in case of an asymmetric route.
883 	 */
884 	if (sk)
885 		arg.bound_dev_if = sk->sk_bound_dev_if;
886 
887 	trace_tcp_send_reset(sk, skb, reason);
888 
889 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
890 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
891 
892 	/* ECN bits of TW reset are cleared */
893 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
894 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
895 	local_bh_disable();
896 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
897 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
898 
899 	sock_net_set(ctl_sk, net);
900 	if (sk) {
901 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
902 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
903 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
904 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
905 		transmit_time = tcp_transmit_time(sk);
906 		xfrm_sk_clone_policy(ctl_sk, sk);
907 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
908 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
909 	} else {
910 		ctl_sk->sk_mark = 0;
911 		ctl_sk->sk_priority = 0;
912 	}
913 	ip_send_unicast_reply(ctl_sk, sk,
914 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
915 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
916 			      &arg, arg.iov[0].iov_len,
917 			      transmit_time, txhash);
918 
919 	xfrm_sk_free_policy(ctl_sk);
920 	sock_net_set(ctl_sk, &init_net);
921 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
923 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
924 	local_bh_enable();
925 
926 #ifdef CONFIG_TCP_MD5SIG
927 out:
928 	rcu_read_unlock();
929 #endif
930 }
931 
932 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
933    outside of socket context, is certainly ugly. What can I do?
934  */
935 
936 static void tcp_v4_send_ack(const struct sock *sk,
937 			    struct sk_buff *skb, u32 seq, u32 ack,
938 			    u32 win, u32 tsval, u32 tsecr, int oif,
939 			    struct tcp_key *key,
940 			    int reply_flags, u8 tos, u32 txhash)
941 {
942 	const struct tcphdr *th = tcp_hdr(skb);
943 	struct {
944 		struct tcphdr th;
945 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
946 	} rep;
947 	struct net *net = sock_net(sk);
948 	struct ip_reply_arg arg;
949 	struct sock *ctl_sk;
950 	u64 transmit_time;
951 
952 	memset(&rep.th, 0, sizeof(struct tcphdr));
953 	memset(&arg, 0, sizeof(arg));
954 
955 	arg.iov[0].iov_base = (unsigned char *)&rep;
956 	arg.iov[0].iov_len  = sizeof(rep.th);
957 	if (tsecr) {
958 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
959 				   (TCPOPT_TIMESTAMP << 8) |
960 				   TCPOLEN_TIMESTAMP);
961 		rep.opt[1] = htonl(tsval);
962 		rep.opt[2] = htonl(tsecr);
963 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
964 	}
965 
966 	/* Swap the send and the receive. */
967 	rep.th.dest    = th->source;
968 	rep.th.source  = th->dest;
969 	rep.th.doff    = arg.iov[0].iov_len / 4;
970 	rep.th.seq     = htonl(seq);
971 	rep.th.ack_seq = htonl(ack);
972 	rep.th.ack     = 1;
973 	rep.th.window  = htons(win);
974 
975 #ifdef CONFIG_TCP_MD5SIG
976 	if (tcp_key_is_md5(key)) {
977 		int offset = (tsecr) ? 3 : 0;
978 
979 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
980 					  (TCPOPT_NOP << 16) |
981 					  (TCPOPT_MD5SIG << 8) |
982 					  TCPOLEN_MD5SIG);
983 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
984 		rep.th.doff = arg.iov[0].iov_len/4;
985 
986 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
987 				    key->md5_key, ip_hdr(skb)->saddr,
988 				    ip_hdr(skb)->daddr, &rep.th);
989 	}
990 #endif
991 #ifdef CONFIG_TCP_AO
992 	if (tcp_key_is_ao(key)) {
993 		int offset = (tsecr) ? 3 : 0;
994 
995 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
996 					  (tcp_ao_len(key->ao_key) << 16) |
997 					  (key->ao_key->sndid << 8) |
998 					  key->rcv_next);
999 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1000 		rep.th.doff = arg.iov[0].iov_len / 4;
1001 
1002 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1003 				key->ao_key, key->traffic_key,
1004 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1005 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1006 				&rep.th, key->sne);
1007 	}
1008 #endif
1009 	arg.flags = reply_flags;
1010 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1011 				      ip_hdr(skb)->saddr, /* XXX */
1012 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1013 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1014 	if (oif)
1015 		arg.bound_dev_if = oif;
1016 	arg.tos = tos;
1017 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1018 	local_bh_disable();
1019 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1020 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1021 	sock_net_set(ctl_sk, net);
1022 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1023 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1024 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1025 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1026 	transmit_time = tcp_transmit_time(sk);
1027 	ip_send_unicast_reply(ctl_sk, sk,
1028 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1029 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1030 			      &arg, arg.iov[0].iov_len,
1031 			      transmit_time, txhash);
1032 
1033 	sock_net_set(ctl_sk, &init_net);
1034 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1035 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1036 	local_bh_enable();
1037 }
1038 
1039 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1040 				enum tcp_tw_status tw_status)
1041 {
1042 	struct inet_timewait_sock *tw = inet_twsk(sk);
1043 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1044 	struct tcp_key key = {};
1045 	u8 tos = tw->tw_tos;
1046 
1047 	/* Clean only the ECN bits of TW ACKs for oow data or paws_reject,
1048 	 * while not cleaning the ECN bits of other TW ACKs, to avoid those ACKs
1049 	 * being placed in a different service queue (Classic rather than L4S).
1050 	 */
1051 	if (tw_status == TCP_TW_ACK_OOW)
1052 		tos &= ~INET_ECN_MASK;
1053 
1054 #ifdef CONFIG_TCP_AO
1055 	struct tcp_ao_info *ao_info;
1056 
1057 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1058 		/* FIXME: the segment to-be-acked is not verified yet */
1059 		ao_info = rcu_dereference(tcptw->ao_info);
1060 		if (ao_info) {
1061 			const struct tcp_ao_hdr *aoh;
1062 
1063 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1064 				inet_twsk_put(tw);
1065 				return;
1066 			}
1067 
1068 			if (aoh)
1069 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1070 								    aoh->rnext_keyid, -1);
1071 		}
1072 	}
1073 	if (key.ao_key) {
1074 		struct tcp_ao_key *rnext_key;
1075 
1076 		key.traffic_key = snd_other_key(key.ao_key);
1077 		key.sne = READ_ONCE(ao_info->snd_sne);
1078 		rnext_key = READ_ONCE(ao_info->rnext_key);
1079 		key.rcv_next = rnext_key->rcvid;
1080 		key.type = TCP_KEY_AO;
1081 #else
1082 	if (0) {
1083 #endif
1084 	} else if (static_branch_tcp_md5()) {
1085 		key.md5_key = tcp_twsk_md5_key(tcptw);
1086 		if (key.md5_key)
1087 			key.type = TCP_KEY_MD5;
1088 	}
1089 
1090 	tcp_v4_send_ack(sk, skb,
1091 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1092 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1093 			tcp_tw_tsval(tcptw),
1094 			READ_ONCE(tcptw->tw_ts_recent),
1095 			tw->tw_bound_dev_if, &key,
1096 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1097 			tos,
1098 			tw->tw_txhash);
1099 
1100 	inet_twsk_put(tw);
1101 }
1102 
1103 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1104 				  struct request_sock *req)
1105 {
1106 	struct tcp_key key = {};
1107 
1108 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1109 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1110 	 */
1111 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1112 					     tcp_sk(sk)->snd_nxt;
1113 
1114 #ifdef CONFIG_TCP_AO
1115 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1116 	    tcp_rsk_used_ao(req)) {
1117 		const union tcp_md5_addr *addr;
1118 		const struct tcp_ao_hdr *aoh;
1119 		int l3index;
1120 
1121 		/* Invalid TCP option size or twice included auth */
1122 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1123 			return;
1124 		if (!aoh)
1125 			return;
1126 
1127 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1128 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1129 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1130 					      aoh->rnext_keyid, -1);
1131 		if (unlikely(!key.ao_key)) {
1132 			/* Send ACK with any matching MKT for the peer */
1133 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1134 			/* Matching key disappeared (user removed the key?)
1135 			 * let the handshake time out.
1136 			 */
1137 			if (!key.ao_key) {
1138 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1139 						     addr,
1140 						     ntohs(tcp_hdr(skb)->source),
1141 						     &ip_hdr(skb)->daddr,
1142 						     ntohs(tcp_hdr(skb)->dest));
1143 				return;
1144 			}
1145 		}
1146 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1147 		if (!key.traffic_key)
1148 			return;
1149 
1150 		key.type = TCP_KEY_AO;
1151 		key.rcv_next = aoh->keyid;
1152 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1153 #else
1154 	if (0) {
1155 #endif
1156 	} else if (static_branch_tcp_md5()) {
1157 		const union tcp_md5_addr *addr;
1158 		int l3index;
1159 
1160 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1161 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1162 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1163 		if (key.md5_key)
1164 			key.type = TCP_KEY_MD5;
1165 	}
1166 
1167 	/* Clean the ECN bits of ACKs sent for oow data or on paws_reject */
1168 	tcp_v4_send_ack(sk, skb, seq,
1169 			tcp_rsk(req)->rcv_nxt,
1170 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1171 			tcp_rsk_tsval(tcp_rsk(req)),
1172 			req->ts_recent,
1173 			0, &key,
1174 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1175 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1176 			READ_ONCE(tcp_rsk(req)->txhash));
1177 	if (tcp_key_is_ao(&key))
1178 		kfree(key.traffic_key);
1179 }
1180 
1181 /*
1182  *	Send a SYN-ACK after having received a SYN.
1183  *	This still operates on a request_sock only, not on a big
1184  *	socket.
1185  */
1186 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1187 			      struct flowi *fl,
1188 			      struct request_sock *req,
1189 			      struct tcp_fastopen_cookie *foc,
1190 			      enum tcp_synack_type synack_type,
1191 			      struct sk_buff *syn_skb)
1192 {
1193 	const struct inet_request_sock *ireq = inet_rsk(req);
1194 	struct flowi4 fl4;
1195 	int err = -1;
1196 	struct sk_buff *skb;
1197 	u8 tos;
1198 
1199 	/* First, grab a route. */
1200 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1201 		return -1;
1202 
1203 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1204 
1205 	if (skb) {
1206 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1207 
1208 		tos = READ_ONCE(inet_sk(sk)->tos);
1209 
1210 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1211 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1212 			      (tos & INET_ECN_MASK);
1213 
1214 		if (!INET_ECN_is_capable(tos) &&
1215 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1216 			tos |= INET_ECN_ECT_0;
1217 
1218 		rcu_read_lock();
1219 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1220 					    ireq->ir_rmt_addr,
1221 					    rcu_dereference(ireq->ireq_opt),
1222 					    tos);
1223 		rcu_read_unlock();
1224 		err = net_xmit_eval(err);
1225 	}
1226 
1227 	return err;
1228 }
1229 
1230 /*
1231  *	IPv4 request_sock destructor.
1232  */
1233 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1234 {
1235 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1236 }
1237 
1238 #ifdef CONFIG_TCP_MD5SIG
1239 /*
1240  * RFC2385 MD5 checksumming requires a mapping of
1241  * IP address->MD5 Key.
1242  * We need to maintain these in the sk structure.
1243  */
1244 
1245 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1246 EXPORT_IPV6_MOD(tcp_md5_needed);
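/* tcp_md5_needed is a deferred static key: after the last reference is
 * dropped the branch stays enabled for roughly HZ jiffies before being
 * patched out (jump_label_ratelimit machinery), so adding and removing keys
 * in quick succession does not thrash the code patching.
 */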
1247 
1248 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1249 {
1250 	if (!old)
1251 		return true;
1252 
1253 	/* l3index always overrides non-l3index */
1254 	if (old->l3index && new->l3index == 0)
1255 		return false;
1256 	if (old->l3index == 0 && new->l3index)
1257 		return true;
1258 
1259 	return old->prefixlen < new->prefixlen;
1260 }
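/* Resulting precedence, for illustration: a key bound to an L3 domain
 * (non-zero l3index) always wins over an unbound key regardless of prefix
 * length; among keys with the same binding the more specific prefix wins,
 * e.g. a 10.0.0.0/24 key overrides a 10.0.0.0/8 key for peer 10.0.0.5.
 */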
1261 
1262 /* Find the Key structure for an address.  */
1263 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1264 					   const union tcp_md5_addr *addr,
1265 					   int family, bool any_l3index)
1266 {
1267 	const struct tcp_sock *tp = tcp_sk(sk);
1268 	struct tcp_md5sig_key *key;
1269 	const struct tcp_md5sig_info *md5sig;
1270 	__be32 mask;
1271 	struct tcp_md5sig_key *best_match = NULL;
1272 	bool match;
1273 
1274 	/* caller either holds rcu_read_lock() or socket lock */
1275 	md5sig = rcu_dereference_check(tp->md5sig_info,
1276 				       lockdep_sock_is_held(sk));
1277 	if (!md5sig)
1278 		return NULL;
1279 
1280 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1281 				 lockdep_sock_is_held(sk)) {
1282 		if (key->family != family)
1283 			continue;
1284 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1285 		    key->l3index != l3index)
1286 			continue;
1287 		if (family == AF_INET) {
1288 			mask = inet_make_mask(key->prefixlen);
1289 			match = (key->addr.a4.s_addr & mask) ==
1290 				(addr->a4.s_addr & mask);
1291 #if IS_ENABLED(CONFIG_IPV6)
1292 		} else if (family == AF_INET6) {
1293 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1294 						  key->prefixlen);
1295 #endif
1296 		} else {
1297 			match = false;
1298 		}
1299 
1300 		if (match && better_md5_match(best_match, key))
1301 			best_match = key;
1302 	}
1303 	return best_match;
1304 }
1305 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1306 
1307 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1308 						      const union tcp_md5_addr *addr,
1309 						      int family, u8 prefixlen,
1310 						      int l3index, u8 flags)
1311 {
1312 	const struct tcp_sock *tp = tcp_sk(sk);
1313 	struct tcp_md5sig_key *key;
1314 	unsigned int size = sizeof(struct in_addr);
1315 	const struct tcp_md5sig_info *md5sig;
1316 
1317 	/* caller either holds rcu_read_lock() or socket lock */
1318 	md5sig = rcu_dereference_check(tp->md5sig_info,
1319 				       lockdep_sock_is_held(sk));
1320 	if (!md5sig)
1321 		return NULL;
1322 #if IS_ENABLED(CONFIG_IPV6)
1323 	if (family == AF_INET6)
1324 		size = sizeof(struct in6_addr);
1325 #endif
1326 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1327 				 lockdep_sock_is_held(sk)) {
1328 		if (key->family != family)
1329 			continue;
1330 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1331 			continue;
1332 		if (key->l3index != l3index)
1333 			continue;
1334 		if (!memcmp(&key->addr, addr, size) &&
1335 		    key->prefixlen == prefixlen)
1336 			return key;
1337 	}
1338 	return NULL;
1339 }
1340 
1341 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1342 					 const struct sock *addr_sk)
1343 {
1344 	const union tcp_md5_addr *addr;
1345 	int l3index;
1346 
1347 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1348 						 addr_sk->sk_bound_dev_if);
1349 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1350 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1351 }
1352 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1353 
1354 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1355 {
1356 	struct tcp_sock *tp = tcp_sk(sk);
1357 	struct tcp_md5sig_info *md5sig;
1358 
1359 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1360 	if (!md5sig)
1361 		return -ENOMEM;
1362 
1363 	sk_gso_disable(sk);
1364 	INIT_HLIST_HEAD(&md5sig->head);
1365 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1366 	return 0;
1367 }
1368 
1369 /* This can be called on a newly created socket, from other files */
1370 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1371 			    int family, u8 prefixlen, int l3index, u8 flags,
1372 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1373 {
1374 	/* Add Key to the list */
1375 	struct tcp_md5sig_key *key;
1376 	struct tcp_sock *tp = tcp_sk(sk);
1377 	struct tcp_md5sig_info *md5sig;
1378 
1379 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1380 	if (key) {
1381 		/* Pre-existing entry - just update that one.
1382 		 * Note that the key might be used concurrently.
1383 		 * data_race() is telling kcsan that we do not care about
1384 		 * key mismatches, since changing MD5 key on live flows
1385 		 * can lead to packet drops.
1386 		 */
1387 		data_race(memcpy(key->key, newkey, newkeylen));
1388 
1389 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1390 		 * Also note that a reader could catch new key->keylen value
1391 		 * but old key->key[], this is the reason we use __GFP_ZERO
1392 		 * at sock_kmalloc() time below these lines.
1393 		 */
1394 		WRITE_ONCE(key->keylen, newkeylen);
1395 
1396 		return 0;
1397 	}
1398 
1399 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1400 					   lockdep_sock_is_held(sk));
1401 
1402 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1403 	if (!key)
1404 		return -ENOMEM;
1405 
1406 	memcpy(key->key, newkey, newkeylen);
1407 	key->keylen = newkeylen;
1408 	key->family = family;
1409 	key->prefixlen = prefixlen;
1410 	key->l3index = l3index;
1411 	key->flags = flags;
1412 	memcpy(&key->addr, addr,
1413 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1414 								 sizeof(struct in_addr));
1415 	hlist_add_head_rcu(&key->node, &md5sig->head);
1416 	return 0;
1417 }
1418 
1419 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1420 		   int family, u8 prefixlen, int l3index, u8 flags,
1421 		   const u8 *newkey, u8 newkeylen)
1422 {
1423 	struct tcp_sock *tp = tcp_sk(sk);
1424 
1425 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1426 		if (tcp_md5_alloc_sigpool())
1427 			return -ENOMEM;
1428 
1429 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1430 			tcp_md5_release_sigpool();
1431 			return -ENOMEM;
1432 		}
1433 
1434 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1435 			struct tcp_md5sig_info *md5sig;
1436 
1437 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1438 			rcu_assign_pointer(tp->md5sig_info, NULL);
1439 			kfree_rcu(md5sig, rcu);
1440 			tcp_md5_release_sigpool();
1441 			return -EUSERS;
1442 		}
1443 	}
1444 
1445 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1446 				newkey, newkeylen, GFP_KERNEL);
1447 }
1448 EXPORT_IPV6_MOD(tcp_md5_do_add);
1449 
1450 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1451 		     int family, u8 prefixlen, int l3index,
1452 		     struct tcp_md5sig_key *key)
1453 {
1454 	struct tcp_sock *tp = tcp_sk(sk);
1455 
1456 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1457 		tcp_md5_add_sigpool();
1458 
1459 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1460 			tcp_md5_release_sigpool();
1461 			return -ENOMEM;
1462 		}
1463 
1464 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1465 			struct tcp_md5sig_info *md5sig;
1466 
1467 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1468 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1469 			rcu_assign_pointer(tp->md5sig_info, NULL);
1470 			kfree_rcu(md5sig, rcu);
1471 			tcp_md5_release_sigpool();
1472 			return -EUSERS;
1473 		}
1474 	}
1475 
1476 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1477 				key->flags, key->key, key->keylen,
1478 				sk_gfp_mask(sk, GFP_ATOMIC));
1479 }
1480 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1481 
1482 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1483 		   u8 prefixlen, int l3index, u8 flags)
1484 {
1485 	struct tcp_md5sig_key *key;
1486 
1487 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1488 	if (!key)
1489 		return -ENOENT;
1490 	hlist_del_rcu(&key->node);
1491 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1492 	kfree_rcu(key, rcu);
1493 	return 0;
1494 }
1495 EXPORT_IPV6_MOD(tcp_md5_do_del);
1496 
1497 void tcp_clear_md5_list(struct sock *sk)
1498 {
1499 	struct tcp_sock *tp = tcp_sk(sk);
1500 	struct tcp_md5sig_key *key;
1501 	struct hlist_node *n;
1502 	struct tcp_md5sig_info *md5sig;
1503 
1504 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1505 
1506 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1507 		hlist_del_rcu(&key->node);
1508 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1509 		kfree_rcu(key, rcu);
1510 	}
1511 }
1512 
1513 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1514 				 sockptr_t optval, int optlen)
1515 {
1516 	struct tcp_md5sig cmd;
1517 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1518 	const union tcp_md5_addr *addr;
1519 	u8 prefixlen = 32;
1520 	int l3index = 0;
1521 	bool l3flag;
1522 	u8 flags;
1523 
1524 	if (optlen < sizeof(cmd))
1525 		return -EINVAL;
1526 
1527 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1528 		return -EFAULT;
1529 
1530 	if (sin->sin_family != AF_INET)
1531 		return -EINVAL;
1532 
1533 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1534 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1535 
1536 	if (optname == TCP_MD5SIG_EXT &&
1537 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1538 		prefixlen = cmd.tcpm_prefixlen;
1539 		if (prefixlen > 32)
1540 			return -EINVAL;
1541 	}
1542 
1543 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1544 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1545 		struct net_device *dev;
1546 
1547 		rcu_read_lock();
1548 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1549 		if (dev && netif_is_l3_master(dev))
1550 			l3index = dev->ifindex;
1551 
1552 		rcu_read_unlock();
1553 
1554 		/* ok to reference set/not set outside of rcu;
1555 		 * right now device MUST be an L3 master
1556 		 */
1557 		if (!dev || !l3index)
1558 			return -EINVAL;
1559 	}
1560 
1561 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1562 
1563 	if (!cmd.tcpm_keylen)
1564 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1565 
1566 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1567 		return -EINVAL;
1568 
1569 	/* Don't allow keys for peers that have a matching TCP-AO key.
1570 	 * See the comment in tcp_ao_add_cmd()
1571 	 */
1572 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1573 		return -EKEYREJECTED;
1574 
1575 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1576 			      cmd.tcpm_key, cmd.tcpm_keylen);
1577 }
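/* Userspace sketch of driving the parser above (illustration only, not part
 * of this file), using the uapi struct tcp_md5sig:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	a->sin_addr.s_addr = peer_addr;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes the key; TCP_MD5SIG_EXT additionally honours
 * tcpm_prefixlen and tcpm_ifindex as handled above.
 */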
1578 
1579 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1580 				   __be32 daddr, __be32 saddr,
1581 				   const struct tcphdr *th, int nbytes)
1582 {
1583 	struct tcp4_pseudohdr *bp;
1584 	struct scatterlist sg;
1585 	struct tcphdr *_th;
1586 
1587 	bp = hp->scratch;
1588 	bp->saddr = saddr;
1589 	bp->daddr = daddr;
1590 	bp->pad = 0;
1591 	bp->protocol = IPPROTO_TCP;
1592 	bp->len = cpu_to_be16(nbytes);
1593 
1594 	_th = (struct tcphdr *)(bp + 1);
1595 	memcpy(_th, th, sizeof(*th));
1596 	_th->check = 0;
1597 
1598 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1599 	ahash_request_set_crypt(hp->req, &sg, NULL,
1600 				sizeof(*bp) + sizeof(*th));
1601 	return crypto_ahash_update(hp->req);
1602 }
1603 
1604 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1605 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1606 {
1607 	struct tcp_sigpool hp;
1608 
1609 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1610 		goto clear_hash_nostart;
1611 
1612 	if (crypto_ahash_init(hp.req))
1613 		goto clear_hash;
1614 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1615 		goto clear_hash;
1616 	if (tcp_md5_hash_key(&hp, key))
1617 		goto clear_hash;
1618 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1619 	if (crypto_ahash_final(hp.req))
1620 		goto clear_hash;
1621 
1622 	tcp_sigpool_end(&hp);
1623 	return 0;
1624 
1625 clear_hash:
1626 	tcp_sigpool_end(&hp);
1627 clear_hash_nostart:
1628 	memset(md5_hash, 0, 16);
1629 	return 1;
1630 }
1631 
1632 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1633 			const struct sock *sk,
1634 			const struct sk_buff *skb)
1635 {
1636 	const struct tcphdr *th = tcp_hdr(skb);
1637 	struct tcp_sigpool hp;
1638 	__be32 saddr, daddr;
1639 
1640 	if (sk) { /* valid for establish/request sockets */
1641 		saddr = sk->sk_rcv_saddr;
1642 		daddr = sk->sk_daddr;
1643 	} else {
1644 		const struct iphdr *iph = ip_hdr(skb);
1645 		saddr = iph->saddr;
1646 		daddr = iph->daddr;
1647 	}
1648 
1649 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1650 		goto clear_hash_nostart;
1651 
1652 	if (crypto_ahash_init(hp.req))
1653 		goto clear_hash;
1654 
1655 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1656 		goto clear_hash;
1657 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1658 		goto clear_hash;
1659 	if (tcp_md5_hash_key(&hp, key))
1660 		goto clear_hash;
1661 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1662 	if (crypto_ahash_final(hp.req))
1663 		goto clear_hash;
1664 
1665 	tcp_sigpool_end(&hp);
1666 	return 0;
1667 
1668 clear_hash:
1669 	tcp_sigpool_end(&hp);
1670 clear_hash_nostart:
1671 	memset(md5_hash, 0, 16);
1672 	return 1;
1673 }
1674 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1675 
1676 #endif
1677 
1678 static void tcp_v4_init_req(struct request_sock *req,
1679 			    const struct sock *sk_listener,
1680 			    struct sk_buff *skb)
1681 {
1682 	struct inet_request_sock *ireq = inet_rsk(req);
1683 	struct net *net = sock_net(sk_listener);
1684 
1685 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1686 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1687 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1688 }
1689 
1690 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1691 					  struct sk_buff *skb,
1692 					  struct flowi *fl,
1693 					  struct request_sock *req,
1694 					  u32 tw_isn)
1695 {
1696 	tcp_v4_init_req(req, sk, skb);
1697 
1698 	if (security_inet_conn_request(sk, skb, req))
1699 		return NULL;
1700 
1701 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1702 }
1703 
1704 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1705 	.family		=	PF_INET,
1706 	.obj_size	=	sizeof(struct tcp_request_sock),
1707 	.send_ack	=	tcp_v4_reqsk_send_ack,
1708 	.destructor	=	tcp_v4_reqsk_destructor,
1709 	.send_reset	=	tcp_v4_send_reset,
1710 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1711 };
1712 
1713 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1714 	.mss_clamp	=	TCP_MSS_DEFAULT,
1715 #ifdef CONFIG_TCP_MD5SIG
1716 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1717 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1718 #endif
1719 #ifdef CONFIG_TCP_AO
1720 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1721 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1722 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1723 #endif
1724 #ifdef CONFIG_SYN_COOKIES
1725 	.cookie_init_seq =	cookie_v4_init_sequence,
1726 #endif
1727 	.route_req	=	tcp_v4_route_req,
1728 	.init_seq	=	tcp_v4_init_seq,
1729 	.init_ts_off	=	tcp_v4_init_ts_off,
1730 	.send_synack	=	tcp_v4_send_synack,
1731 };
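/* Both ops tables above are handed to the protocol-independent
 * tcp_conn_request() by tcp_v4_conn_request() below; the IPv6 code in
 * net/ipv6/tcp_ipv6.c keeps its own counterparts and shares the rest of the
 * request handling.
 */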
1732 
1733 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1734 {
1735 	/* Never answer to SYNs sent to broadcast or multicast */
1736 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1737 		goto drop;
1738 
1739 	return tcp_conn_request(&tcp_request_sock_ops,
1740 				&tcp_request_sock_ipv4_ops, sk, skb);
1741 
1742 drop:
1743 	tcp_listendrop(sk);
1744 	return 0;
1745 }
1746 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1747 
1748 
1749 /*
1750  * The three way handshake has completed - we got a valid synack -
1751  * now create the new socket.
1752  */
1753 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1754 				  struct request_sock *req,
1755 				  struct dst_entry *dst,
1756 				  struct request_sock *req_unhash,
1757 				  bool *own_req)
1758 {
1759 	struct inet_request_sock *ireq;
1760 	bool found_dup_sk = false;
1761 	struct inet_sock *newinet;
1762 	struct tcp_sock *newtp;
1763 	struct sock *newsk;
1764 #ifdef CONFIG_TCP_MD5SIG
1765 	const union tcp_md5_addr *addr;
1766 	struct tcp_md5sig_key *key;
1767 	int l3index;
1768 #endif
1769 	struct ip_options_rcu *inet_opt;
1770 
1771 	if (sk_acceptq_is_full(sk))
1772 		goto exit_overflow;
1773 
1774 	newsk = tcp_create_openreq_child(sk, req, skb);
1775 	if (!newsk)
1776 		goto exit_nonewsk;
1777 
1778 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1779 	inet_sk_rx_dst_set(newsk, skb);
1780 
1781 	newtp		      = tcp_sk(newsk);
1782 	newinet		      = inet_sk(newsk);
1783 	ireq		      = inet_rsk(req);
1784 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1785 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1786 	newinet->mc_index     = inet_iif(skb);
1787 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1788 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1789 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1790 	if (inet_opt)
1791 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1792 	atomic_set(&newinet->inet_id, get_random_u16());
1793 
1794 	/* Set ToS of the new socket based upon the value of incoming SYN.
1795 	 * ECT bits are set later in tcp_init_transfer().
1796 	 */
1797 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1798 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1799 
1800 	if (!dst) {
1801 		dst = inet_csk_route_child_sock(sk, newsk, req);
1802 		if (!dst)
1803 			goto put_and_exit;
1804 	} else {
1805 		/* syncookie case : see end of cookie_v4_check() */
1806 		/* syncookie case: see end of cookie_v4_check() */
1807 	sk_setup_caps(newsk, dst);
1808 
1809 	tcp_ca_openreq_child(newsk, dst);
1810 
1811 	tcp_sync_mss(newsk, dst_mtu(dst));
1812 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1813 
1814 	tcp_initialize_rcv_mss(newsk);
1815 
1816 #ifdef CONFIG_TCP_MD5SIG
1817 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1818 	/* Copy over the MD5 key from the original socket */
1819 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1820 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1821 	if (key && !tcp_rsk_used_ao(req)) {
1822 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1823 			goto put_and_exit;
1824 		sk_gso_disable(newsk);
1825 	}
1826 #endif
1827 #ifdef CONFIG_TCP_AO
1828 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1829 		goto put_and_exit; /* OOM, release back memory */
1830 #endif
1831 
1832 	if (__inet_inherit_port(sk, newsk) < 0)
1833 		goto put_and_exit;
1834 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1835 				       &found_dup_sk);
1836 	if (likely(*own_req)) {
1837 		tcp_move_syn(newtp, req);
1838 		ireq->ireq_opt = NULL;
1839 	} else {
1840 		newinet->inet_opt = NULL;
1841 
1842 		if (!req_unhash && found_dup_sk) {
1843 			/* This code path should only be executed in the
1844 			 * syncookie case
1845 			 */
1846 			bh_unlock_sock(newsk);
1847 			sock_put(newsk);
1848 			newsk = NULL;
1849 		}
1850 	}
1851 	return newsk;
1852 
1853 exit_overflow:
1854 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1855 exit_nonewsk:
1856 	dst_release(dst);
1857 exit:
1858 	tcp_listendrop(sk);
1859 	return NULL;
1860 put_and_exit:
1861 	newinet->inet_opt = NULL;
1862 	inet_csk_prepare_forced_close(newsk);
1863 	tcp_done(newsk);
1864 	goto exit;
1865 }
1866 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1867 
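/* Syncookies are only consulted for bare ACKs: when the SYN queue
 * overflowed, the listener answered the SYN with a cookie encoded in the
 * ISN instead of creating a request sock, so the handshake-completing ACK
 * (which carries no SYN flag) is validated here by cookie_v4_check().
 */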
1868 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1869 {
1870 #ifdef CONFIG_SYN_COOKIES
1871 	const struct tcphdr *th = tcp_hdr(skb);
1872 
1873 	if (!th->syn)
1874 		sk = cookie_v4_check(sk, skb);
1875 #endif
1876 	return sk;
1877 }
1878 
1879 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1880 			 struct tcphdr *th, u32 *cookie)
1881 {
1882 	u16 mss = 0;
1883 #ifdef CONFIG_SYN_COOKIES
1884 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1885 				    &tcp_request_sock_ipv4_ops, sk, th);
1886 	if (mss) {
1887 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1888 		tcp_synq_overflow(sk);
1889 	}
1890 #endif
1891 	return mss;
1892 }
1893 
1894 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1895 							   u32));
1896 /* The socket must have its spinlock held when we get
1897  * here, unless it is a TCP_LISTEN socket.
1898  *
1899  * We have a potential double-lock case here, so even when
1900  * doing backlog processing we use the BH locking scheme.
1901  * This is because we cannot sleep with the original spinlock
1902  * held.
1903  */
1904 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1905 {
1906 	enum skb_drop_reason reason;
1907 	struct sock *rsk;
1908 
1909 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1910 		struct dst_entry *dst;
1911 
1912 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1913 						lockdep_sock_is_held(sk));
1914 
1915 		sock_rps_save_rxhash(sk, skb);
1916 		sk_mark_napi_id(sk, skb);
1917 		if (dst) {
1918 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1919 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1920 					     dst, 0)) {
1921 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1922 				dst_release(dst);
1923 			}
1924 		}
1925 		tcp_rcv_established(sk, skb);
1926 		return 0;
1927 	}
1928 
1929 	if (tcp_checksum_complete(skb))
1930 		goto csum_err;
1931 
1932 	if (sk->sk_state == TCP_LISTEN) {
1933 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1934 
1935 		if (!nsk)
1936 			return 0;
1937 		if (nsk != sk) {
1938 			reason = tcp_child_process(sk, nsk, skb);
1939 			if (reason) {
1940 				rsk = nsk;
1941 				goto reset;
1942 			}
1943 			return 0;
1944 		}
1945 	} else
1946 		sock_rps_save_rxhash(sk, skb);
1947 
1948 	reason = tcp_rcv_state_process(sk, skb);
1949 	if (reason) {
1950 		rsk = sk;
1951 		goto reset;
1952 	}
1953 	return 0;
1954 
1955 reset:
1956 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1957 discard:
1958 	sk_skb_reason_drop(sk, skb, reason);
1959 	/* Be careful here. If this function gets more complicated and
1960 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1961 	 * might be destroyed here. This current version compiles correctly,
1962 	 * but you have been warned.
1963 	 */
1964 	return 0;
1965 
1966 csum_err:
1967 	reason = SKB_DROP_REASON_TCP_CSUM;
1968 	trace_tcp_bad_csum(skb);
1969 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1970 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1971 	goto discard;
1972 }
1973 EXPORT_SYMBOL(tcp_v4_do_rcv);
1974 
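/* Early demux, called from the IP receive path before the routing
 * decision: if an established socket matches the 4-tuple, attach it to
 * the skb (and reuse its cached rx dst when it is still valid for the
 * incoming interface) so the regular socket lookup and the route lookup
 * can be skipped later.
 */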
1975 int tcp_v4_early_demux(struct sk_buff *skb)
1976 {
1977 	struct net *net = dev_net_rcu(skb->dev);
1978 	const struct iphdr *iph;
1979 	const struct tcphdr *th;
1980 	struct sock *sk;
1981 
1982 	if (skb->pkt_type != PACKET_HOST)
1983 		return 0;
1984 
1985 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1986 		return 0;
1987 
1988 	iph = ip_hdr(skb);
1989 	th = tcp_hdr(skb);
1990 
1991 	if (th->doff < sizeof(struct tcphdr) / 4)
1992 		return 0;
1993 
1994 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1995 				       iph->saddr, th->source,
1996 				       iph->daddr, ntohs(th->dest),
1997 				       skb->skb_iif, inet_sdif(skb));
1998 	if (sk) {
1999 		skb->sk = sk;
2000 		skb->destructor = sock_edemux;
2001 		if (sk_fullsock(sk)) {
2002 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2003 
2004 			if (dst)
2005 				dst = dst_check(dst, 0);
2006 			if (dst &&
2007 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2008 				skb_dst_set_noref(skb, dst);
2009 		}
2010 	}
2011 	return 0;
2012 }
2013 
2014 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2015 		     enum skb_drop_reason *reason)
2016 {
2017 	u32 tail_gso_size, tail_gso_segs;
2018 	struct skb_shared_info *shinfo;
2019 	const struct tcphdr *th;
2020 	struct tcphdr *thtail;
2021 	struct sk_buff *tail;
2022 	unsigned int hdrlen;
2023 	bool fragstolen;
2024 	u32 gso_segs;
2025 	u32 gso_size;
2026 	u64 limit;
2027 	int delta;
2028 
2029 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2030 	 * we can fix skb->truesize to its real value to avoid future drops.
2031 	 * This is valid because skb is not yet charged to the socket.
2032 	 * It has been noticed that pure SACK packets were sometimes dropped
2033 	 * (if cooked by drivers without the copybreak feature).
2034 	 */
2035 	skb_condense(skb);
2036 
2037 	tcp_cleanup_skb(skb);
2038 
2039 	if (unlikely(tcp_checksum_complete(skb))) {
2040 		bh_unlock_sock(sk);
2041 		trace_tcp_bad_csum(skb);
2042 		*reason = SKB_DROP_REASON_TCP_CSUM;
2043 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2044 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2045 		return true;
2046 	}
2047 
2048 	/* Attempt coalescing to last skb in backlog, even if we are
2049 	 * above the limits.
2050 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2051 	 */
2052 	th = (const struct tcphdr *)skb->data;
2053 	hdrlen = th->doff * 4;
2054 
2055 	tail = sk->sk_backlog.tail;
2056 	if (!tail)
2057 		goto no_coalesce;
2058 	thtail = (struct tcphdr *)tail->data;
2059 
2060 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
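	/* Coalescing criteria: the new segment must directly follow the
	 * backlog tail in sequence space, carry the same DS field, neither
	 * segment may have SYN/RST/URG set, both must have ACK set, the
	 * AE/ECE/CWR bits must agree, the skbs must be mergeable, and the
	 * TCP header length and options must be byte-identical.
	 */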
2061 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2062 	    ((TCP_SKB_CB(tail)->tcp_flags |
2063 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2064 	    !((TCP_SKB_CB(tail)->tcp_flags &
2065 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2066 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2067 	      TCP_SKB_CB(skb)->tcp_flags) &
2068 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2069 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2070 	    thtail->doff != th->doff ||
2071 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2072 		goto no_coalesce;
2073 
2074 	__skb_pull(skb, hdrlen);
2075 
2076 	shinfo = skb_shinfo(skb);
2077 	gso_size = shinfo->gso_size ?: skb->len;
2078 	gso_segs = shinfo->gso_segs ?: 1;
2079 
2080 	shinfo = skb_shinfo(tail);
2081 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2082 	tail_gso_segs = shinfo->gso_segs ?: 1;
2083 
2084 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2085 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2086 
2087 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2088 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2089 			thtail->window = th->window;
2090 		}
2091 
2092 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2093 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2094 		 * is not entered if we append a packet with a FIN.
2095 		 * SYN, RST, URG are not present.
2096 		 * ACK is set on both packets.
2097 		 * PSH : we do not really care in TCP stack,
2098 		 *       at least for 'GRO' packets.
2099 		 */
2100 		thtail->fin |= th->fin;
2101 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2102 
2103 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2104 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2105 			tail->tstamp = skb->tstamp;
2106 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2107 		}
2108 
2109 		/* Not as strict as GRO. We only need to carry the max mss value */
2110 		shinfo->gso_size = max(gso_size, tail_gso_size);
2111 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2112 
2113 		sk->sk_backlog.len += delta;
2114 		__NET_INC_STATS(sock_net(sk),
2115 				LINUX_MIB_TCPBACKLOGCOALESCE);
2116 		kfree_skb_partial(skb, fragstolen);
2117 		return false;
2118 	}
2119 	__skb_push(skb, hdrlen);
2120 
2121 no_coalesce:
2122 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2123 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2124 	 * sk_rcvbuf in normal conditions.
2125 	 */
2126 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2127 
2128 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2129 
2130 	/* Only the socket owner can try to collapse/prune rx queues
2131 	 * to reduce memory overhead, so add a little headroom here.
2132 	 * Only a few socket backlogs are likely to be non-empty at the same time.
2133 	 */
2134 	limit += 64 * 1024;
2135 
2136 	limit = min_t(u64, limit, UINT_MAX);
2137 
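	/* Illustrative sizing, assuming the common defaults of
	 * tcp_rmem[1] = 131072 and tcp_wmem[1] = 16384 with no autotuning
	 * growth yet:
	 *	limit = 2 * 131072 + 16384 / 2 + 64 * 1024 = 335872 bytes
	 * of backlog before sk_add_backlog() starts refusing packets.
	 */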
2138 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2139 		bh_unlock_sock(sk);
2140 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2141 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2142 		return true;
2143 	}
2144 	return false;
2145 }
2146 EXPORT_IPV6_MOD(tcp_add_backlog);
2147 
2148 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2149 {
2150 	struct tcphdr *th = (struct tcphdr *)skb->data;
2151 
2152 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2153 }
2154 EXPORT_IPV6_MOD(tcp_filter);
2155 
2156 static void tcp_v4_restore_cb(struct sk_buff *skb)
2157 {
2158 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2159 		sizeof(struct inet_skb_parm));
2160 }
2161 
2162 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2163 			   const struct tcphdr *th)
2164 {
2165 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2166 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2167 	 */
2168 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2169 		sizeof(struct inet_skb_parm));
2170 	barrier();
2171 
2172 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2173 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2174 				    skb->len - th->doff * 4);
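	/* Example: a segment with seq 1000, 100 bytes of payload and the
	 * FIN flag set gets end_seq = 1000 + 0 + 1 + 100 = 1101, since SYN
	 * and FIN each consume one unit of sequence space.
	 */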
2175 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2176 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2177 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2178 	TCP_SKB_CB(skb)->sacked	 = 0;
2179 	TCP_SKB_CB(skb)->has_rxtstamp =
2180 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2181 }
2182 
2183 /*
2184  *	From tcp_input.c
2185  */
2186 
2187 int tcp_v4_rcv(struct sk_buff *skb)
2188 {
2189 	struct net *net = dev_net_rcu(skb->dev);
2190 	enum skb_drop_reason drop_reason;
2191 	enum tcp_tw_status tw_status;
2192 	int sdif = inet_sdif(skb);
2193 	int dif = inet_iif(skb);
2194 	const struct iphdr *iph;
2195 	const struct tcphdr *th;
2196 	struct sock *sk = NULL;
2197 	bool refcounted;
2198 	int ret;
2199 	u32 isn;
2200 
2201 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2202 	if (skb->pkt_type != PACKET_HOST)
2203 		goto discard_it;
2204 
2205 	/* Count it even if it's bad */
2206 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2207 
2208 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2209 		goto discard_it;
2210 
2211 	th = (const struct tcphdr *)skb->data;
2212 
2213 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2214 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2215 		goto bad_packet;
2216 	}
2217 	if (!pskb_may_pull(skb, th->doff * 4))
2218 		goto discard_it;
2219 
2220 	/* An explanation is required here, I think.
2221 	 * Packet length and doff are validated by header prediction,
2222 	 * provided the case of th->doff == 0 is eliminated.
2223 	 * So, we defer the checks. */
2224 
2225 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2226 		goto csum_error;
2227 
2228 	th = (const struct tcphdr *)skb->data;
2229 	iph = ip_hdr(skb);
2230 lookup:
2231 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2232 			       skb, __tcp_hdrlen(th), th->source,
2233 			       th->dest, sdif, &refcounted);
2234 	if (!sk)
2235 		goto no_tcp_socket;
2236 
2237 	if (sk->sk_state == TCP_TIME_WAIT)
2238 		goto do_time_wait;
2239 
2240 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2241 		struct request_sock *req = inet_reqsk(sk);
2242 		bool req_stolen = false;
2243 		struct sock *nsk;
2244 
2245 		sk = req->rsk_listener;
2246 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2247 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2248 		else
2249 			drop_reason = tcp_inbound_hash(sk, req, skb,
2250 						       &iph->saddr, &iph->daddr,
2251 						       AF_INET, dif, sdif);
2252 		if (unlikely(drop_reason)) {
2253 			sk_drops_add(sk, skb);
2254 			reqsk_put(req);
2255 			goto discard_it;
2256 		}
2257 		if (tcp_checksum_complete(skb)) {
2258 			reqsk_put(req);
2259 			goto csum_error;
2260 		}
2261 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2262 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2263 			if (!nsk) {
2264 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2265 				goto lookup;
2266 			}
2267 			sk = nsk;
2268 			/* reuseport_migrate_sock() has already held one sk_refcnt
2269 			 * before returning.
2270 			 */
2271 		} else {
2272 			/* We own a reference on the listener, increase it again
2273 			 * as we might lose it too soon.
2274 			 */
2275 			sock_hold(sk);
2276 		}
2277 		refcounted = true;
2278 		nsk = NULL;
2279 		if (!tcp_filter(sk, skb)) {
2280 			th = (const struct tcphdr *)skb->data;
2281 			iph = ip_hdr(skb);
2282 			tcp_v4_fill_cb(skb, iph, th);
2283 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2284 					    &drop_reason);
2285 		} else {
2286 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2287 		}
2288 		if (!nsk) {
2289 			reqsk_put(req);
2290 			if (req_stolen) {
2291 				/* Another cpu got exclusive access to req
2292 				 * and created a full-blown socket.
2293 				 * Try to feed this packet to this socket
2294 				 * instead of discarding it.
2295 				 */
2296 				tcp_v4_restore_cb(skb);
2297 				sock_put(sk);
2298 				goto lookup;
2299 			}
2300 			goto discard_and_relse;
2301 		}
2302 		nf_reset_ct(skb);
2303 		if (nsk == sk) {
2304 			reqsk_put(req);
2305 			tcp_v4_restore_cb(skb);
2306 		} else {
2307 			drop_reason = tcp_child_process(sk, nsk, skb);
2308 			if (drop_reason) {
2309 				enum sk_rst_reason rst_reason;
2310 
2311 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2312 				tcp_v4_send_reset(nsk, skb, rst_reason);
2313 				goto discard_and_relse;
2314 			}
2315 			sock_put(sk);
2316 			return 0;
2317 		}
2318 	}
2319 
2320 process:
2321 	if (static_branch_unlikely(&ip4_min_ttl)) {
2322 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2323 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2324 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2325 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2326 			goto discard_and_relse;
2327 		}
2328 	}
2329 
2330 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2331 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2332 		goto discard_and_relse;
2333 	}
2334 
2335 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2336 				       AF_INET, dif, sdif);
2337 	if (drop_reason)
2338 		goto discard_and_relse;
2339 
2340 	nf_reset_ct(skb);
2341 
2342 	if (tcp_filter(sk, skb)) {
2343 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2344 		goto discard_and_relse;
2345 	}
2346 	th = (const struct tcphdr *)skb->data;
2347 	iph = ip_hdr(skb);
2348 	tcp_v4_fill_cb(skb, iph, th);
2349 
2350 	skb->dev = NULL;
2351 
2352 	if (sk->sk_state == TCP_LISTEN) {
2353 		ret = tcp_v4_do_rcv(sk, skb);
2354 		goto put_and_return;
2355 	}
2356 
2357 	sk_incoming_cpu_update(sk);
2358 
2359 	bh_lock_sock_nested(sk);
2360 	tcp_segs_in(tcp_sk(sk), skb);
2361 	ret = 0;
2362 	if (!sock_owned_by_user(sk)) {
2363 		ret = tcp_v4_do_rcv(sk, skb);
2364 	} else {
2365 		if (tcp_add_backlog(sk, skb, &drop_reason))
2366 			goto discard_and_relse;
2367 	}
2368 	bh_unlock_sock(sk);
2369 
2370 put_and_return:
2371 	if (refcounted)
2372 		sock_put(sk);
2373 
2374 	return ret;
2375 
2376 no_tcp_socket:
2377 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2378 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2379 		goto discard_it;
2380 
2381 	tcp_v4_fill_cb(skb, iph, th);
2382 
2383 	if (tcp_checksum_complete(skb)) {
2384 csum_error:
2385 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2386 		trace_tcp_bad_csum(skb);
2387 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2388 bad_packet:
2389 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2390 	} else {
2391 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2392 	}
2393 
2394 discard_it:
2395 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2396 	/* Discard frame. */
2397 	sk_skb_reason_drop(sk, skb, drop_reason);
2398 	return 0;
2399 
2400 discard_and_relse:
2401 	sk_drops_add(sk, skb);
2402 	if (refcounted)
2403 		sock_put(sk);
2404 	goto discard_it;
2405 
2406 do_time_wait:
2407 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2408 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2409 		inet_twsk_put(inet_twsk(sk));
2410 		goto discard_it;
2411 	}
2412 
2413 	tcp_v4_fill_cb(skb, iph, th);
2414 
2415 	if (tcp_checksum_complete(skb)) {
2416 		inet_twsk_put(inet_twsk(sk));
2417 		goto csum_error;
2418 	}
2419 
2420 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2421 					       &drop_reason);
2422 	switch (tw_status) {
2423 	case TCP_TW_SYN: {
2424 		struct sock *sk2 = inet_lookup_listener(net,
2425 							net->ipv4.tcp_death_row.hashinfo,
2426 							skb, __tcp_hdrlen(th),
2427 							iph->saddr, th->source,
2428 							iph->daddr, th->dest,
2429 							inet_iif(skb),
2430 							sdif);
2431 		if (sk2) {
2432 			inet_twsk_deschedule_put(inet_twsk(sk));
2433 			sk = sk2;
2434 			tcp_v4_restore_cb(skb);
2435 			refcounted = false;
2436 			__this_cpu_write(tcp_tw_isn, isn);
2437 			goto process;
2438 		}
2439 	}
2440 		/* to ACK */
2441 		fallthrough;
2442 	case TCP_TW_ACK:
2443 	case TCP_TW_ACK_OOW:
2444 		tcp_v4_timewait_ack(sk, skb, tw_status);
2445 		break;
2446 	case TCP_TW_RST:
2447 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2448 		inet_twsk_deschedule_put(inet_twsk(sk));
2449 		goto discard_it;
2450 	case TCP_TW_SUCCESS:;
2451 	}
2452 	goto discard_it;
2453 }
2454 
2455 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2456 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2457 	.twsk_destructor= tcp_twsk_destructor,
2458 };
2459 
2460 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2461 {
2462 	struct dst_entry *dst = skb_dst(skb);
2463 
2464 	if (dst && dst_hold_safe(dst)) {
2465 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2466 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2467 	}
2468 }
2469 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2470 
2471 const struct inet_connection_sock_af_ops ipv4_specific = {
2472 	.queue_xmit	   = ip_queue_xmit,
2473 	.send_check	   = tcp_v4_send_check,
2474 	.rebuild_header	   = inet_sk_rebuild_header,
2475 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2476 	.conn_request	   = tcp_v4_conn_request,
2477 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2478 	.net_header_len	   = sizeof(struct iphdr),
2479 	.setsockopt	   = ip_setsockopt,
2480 	.getsockopt	   = ip_getsockopt,
2481 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2482 };
2483 EXPORT_IPV6_MOD(ipv4_specific);
2484 
2485 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2486 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2487 #ifdef CONFIG_TCP_MD5SIG
2488 	.md5_lookup		= tcp_v4_md5_lookup,
2489 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2490 	.md5_parse		= tcp_v4_parse_md5_keys,
2491 #endif
2492 #ifdef CONFIG_TCP_AO
2493 	.ao_lookup		= tcp_v4_ao_lookup,
2494 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2495 	.ao_parse		= tcp_v4_parse_ao,
2496 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2497 #endif
2498 };
2499 #endif
2500 
2501 /* NOTE: A lot of things are set to zero explicitly by the call to
2502  *       sk_alloc(), so they need not be done here.
2503  */
2504 static int tcp_v4_init_sock(struct sock *sk)
2505 {
2506 	struct inet_connection_sock *icsk = inet_csk(sk);
2507 
2508 	tcp_init_sock(sk);
2509 
2510 	icsk->icsk_af_ops = &ipv4_specific;
2511 
2512 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2513 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2514 #endif
2515 
2516 	return 0;
2517 }
2518 
2519 #ifdef CONFIG_TCP_MD5SIG
2520 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2521 {
2522 	struct tcp_md5sig_info *md5sig;
2523 
2524 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2525 	kfree(md5sig);
2526 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2527 	tcp_md5_release_sigpool();
2528 }
2529 #endif
2530 
2531 static void tcp_release_user_frags(struct sock *sk)
2532 {
2533 #ifdef CONFIG_PAGE_POOL
2534 	unsigned long index;
2535 	void *netmem;
2536 
2537 	xa_for_each(&sk->sk_user_frags, index, netmem)
2538 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2539 #endif
2540 }
2541 
2542 void tcp_v4_destroy_sock(struct sock *sk)
2543 {
2544 	struct tcp_sock *tp = tcp_sk(sk);
2545 
2546 	tcp_release_user_frags(sk);
2547 
2548 	xa_destroy(&sk->sk_user_frags);
2549 
2550 	trace_tcp_destroy_sock(sk);
2551 
2552 	tcp_clear_xmit_timers(sk);
2553 
2554 	tcp_cleanup_congestion_control(sk);
2555 
2556 	tcp_cleanup_ulp(sk);
2557 
2558 	/* Clean up the write buffer. */
2559 	tcp_write_queue_purge(sk);
2560 
2561 	/* Check if we want to disable active TFO */
2562 	tcp_fastopen_active_disable_ofo_check(sk);
2563 
2564 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2565 	skb_rbtree_purge(&tp->out_of_order_queue);
2566 
2567 #ifdef CONFIG_TCP_MD5SIG
2568 	/* Clean up the MD5 key list, if any */
2569 	if (tp->md5sig_info) {
2570 		struct tcp_md5sig_info *md5sig;
2571 
2572 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2573 		tcp_clear_md5_list(sk);
2574 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2575 		rcu_assign_pointer(tp->md5sig_info, NULL);
2576 	}
2577 #endif
2578 	tcp_ao_destroy_sock(sk, false);
2579 
2580 	/* Clean up a referenced TCP bind bucket. */
2581 	if (inet_csk(sk)->icsk_bind_hash)
2582 		inet_put_port(sk);
2583 
2584 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2585 
2586 	/* If socket is aborted during connect operation */
2587 	tcp_free_fastopen_req(tp);
2588 	tcp_fastopen_destroy_cipher(sk);
2589 	tcp_saved_syn_free(tp);
2590 
2591 	sk_sockets_allocated_dec(sk);
2592 }
2593 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2594 
2595 #ifdef CONFIG_PROC_FS
2596 /* Proc filesystem TCP sock list dumping. */
2597 
2598 static unsigned short seq_file_family(const struct seq_file *seq);
2599 
2600 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2601 {
2602 	unsigned short family = seq_file_family(seq);
2603 
2604 	/* AF_UNSPEC is used as a match all */
2605 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2606 		net_eq(sock_net(sk), seq_file_net(seq)));
2607 }
2608 
2609 /* Find a non-empty bucket (starting from st->bucket)
2610  * and return the first sk from it.
2611  */
2612 static void *listening_get_first(struct seq_file *seq)
2613 {
2614 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2615 	struct tcp_iter_state *st = seq->private;
2616 
2617 	st->offset = 0;
2618 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2619 		struct inet_listen_hashbucket *ilb2;
2620 		struct hlist_nulls_node *node;
2621 		struct sock *sk;
2622 
2623 		ilb2 = &hinfo->lhash2[st->bucket];
2624 		if (hlist_nulls_empty(&ilb2->nulls_head))
2625 			continue;
2626 
2627 		spin_lock(&ilb2->lock);
2628 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2629 			if (seq_sk_match(seq, sk))
2630 				return sk;
2631 		}
2632 		spin_unlock(&ilb2->lock);
2633 	}
2634 
2635 	return NULL;
2636 }
2637 
2638 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2639  * If "cur" is the last one in st->bucket,
2640  * call listening_get_first() to return the first sk of the next
2641  * non-empty bucket.
2642  */
2643 static void *listening_get_next(struct seq_file *seq, void *cur)
2644 {
2645 	struct tcp_iter_state *st = seq->private;
2646 	struct inet_listen_hashbucket *ilb2;
2647 	struct hlist_nulls_node *node;
2648 	struct inet_hashinfo *hinfo;
2649 	struct sock *sk = cur;
2650 
2651 	++st->num;
2652 	++st->offset;
2653 
2654 	sk = sk_nulls_next(sk);
2655 	sk_nulls_for_each_from(sk, node) {
2656 		if (seq_sk_match(seq, sk))
2657 			return sk;
2658 	}
2659 
2660 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2661 	ilb2 = &hinfo->lhash2[st->bucket];
2662 	spin_unlock(&ilb2->lock);
2663 	++st->bucket;
2664 	return listening_get_first(seq);
2665 }
2666 
2667 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2668 {
2669 	struct tcp_iter_state *st = seq->private;
2670 	void *rc;
2671 
2672 	st->bucket = 0;
2673 	st->offset = 0;
2674 	rc = listening_get_first(seq);
2675 
2676 	while (rc && *pos) {
2677 		rc = listening_get_next(seq, rc);
2678 		--*pos;
2679 	}
2680 	return rc;
2681 }
2682 
2683 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2684 				const struct tcp_iter_state *st)
2685 {
2686 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2687 }
2688 
2689 /*
2690  * Get first established socket starting from bucket given in st->bucket.
2691  * If st->bucket is zero, the very first socket in the hash is returned.
2692  */
2693 static void *established_get_first(struct seq_file *seq)
2694 {
2695 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2696 	struct tcp_iter_state *st = seq->private;
2697 
2698 	st->offset = 0;
2699 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2700 		struct sock *sk;
2701 		struct hlist_nulls_node *node;
2702 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2703 
2704 		cond_resched();
2705 
2706 		/* Lockless fast path for the common case of empty buckets */
2707 		if (empty_bucket(hinfo, st))
2708 			continue;
2709 
2710 		spin_lock_bh(lock);
2711 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2712 			if (seq_sk_match(seq, sk))
2713 				return sk;
2714 		}
2715 		spin_unlock_bh(lock);
2716 	}
2717 
2718 	return NULL;
2719 }
2720 
2721 static void *established_get_next(struct seq_file *seq, void *cur)
2722 {
2723 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2724 	struct tcp_iter_state *st = seq->private;
2725 	struct hlist_nulls_node *node;
2726 	struct sock *sk = cur;
2727 
2728 	++st->num;
2729 	++st->offset;
2730 
2731 	sk = sk_nulls_next(sk);
2732 
2733 	sk_nulls_for_each_from(sk, node) {
2734 		if (seq_sk_match(seq, sk))
2735 			return sk;
2736 	}
2737 
2738 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2739 	++st->bucket;
2740 	return established_get_first(seq);
2741 }
2742 
2743 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2744 {
2745 	struct tcp_iter_state *st = seq->private;
2746 	void *rc;
2747 
2748 	st->bucket = 0;
2749 	rc = established_get_first(seq);
2750 
2751 	while (rc && pos) {
2752 		rc = established_get_next(seq, rc);
2753 		--pos;
2754 	}
2755 	return rc;
2756 }
2757 
2758 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2759 {
2760 	void *rc;
2761 	struct tcp_iter_state *st = seq->private;
2762 
2763 	st->state = TCP_SEQ_STATE_LISTENING;
2764 	rc	  = listening_get_idx(seq, &pos);
2765 
2766 	if (!rc) {
2767 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2768 		rc	  = established_get_idx(seq, pos);
2769 	}
2770 
2771 	return rc;
2772 }
2773 
2774 static void *tcp_seek_last_pos(struct seq_file *seq)
2775 {
2776 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2777 	struct tcp_iter_state *st = seq->private;
2778 	int bucket = st->bucket;
2779 	int offset = st->offset;
2780 	int orig_num = st->num;
2781 	void *rc = NULL;
2782 
2783 	switch (st->state) {
2784 	case TCP_SEQ_STATE_LISTENING:
2785 		if (st->bucket > hinfo->lhash2_mask)
2786 			break;
2787 		rc = listening_get_first(seq);
2788 		while (offset-- && rc && bucket == st->bucket)
2789 			rc = listening_get_next(seq, rc);
2790 		if (rc)
2791 			break;
2792 		st->bucket = 0;
2793 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2794 		fallthrough;
2795 	case TCP_SEQ_STATE_ESTABLISHED:
2796 		if (st->bucket > hinfo->ehash_mask)
2797 			break;
2798 		rc = established_get_first(seq);
2799 		while (offset-- && rc && bucket == st->bucket)
2800 			rc = established_get_next(seq, rc);
2801 	}
2802 
2803 	st->num = orig_num;
2804 
2805 	return rc;
2806 }
2807 
2808 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2809 {
2810 	struct tcp_iter_state *st = seq->private;
2811 	void *rc;
2812 
2813 	if (*pos && *pos == st->last_pos) {
2814 		rc = tcp_seek_last_pos(seq);
2815 		if (rc)
2816 			goto out;
2817 	}
2818 
2819 	st->state = TCP_SEQ_STATE_LISTENING;
2820 	st->num = 0;
2821 	st->bucket = 0;
2822 	st->offset = 0;
2823 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2824 
2825 out:
2826 	st->last_pos = *pos;
2827 	return rc;
2828 }
2829 EXPORT_IPV6_MOD(tcp_seq_start);
2830 
2831 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2832 {
2833 	struct tcp_iter_state *st = seq->private;
2834 	void *rc = NULL;
2835 
2836 	if (v == SEQ_START_TOKEN) {
2837 		rc = tcp_get_idx(seq, 0);
2838 		goto out;
2839 	}
2840 
2841 	switch (st->state) {
2842 	case TCP_SEQ_STATE_LISTENING:
2843 		rc = listening_get_next(seq, v);
2844 		if (!rc) {
2845 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2846 			st->bucket = 0;
2847 			st->offset = 0;
2848 			rc	  = established_get_first(seq);
2849 		}
2850 		break;
2851 	case TCP_SEQ_STATE_ESTABLISHED:
2852 		rc = established_get_next(seq, v);
2853 		break;
2854 	}
2855 out:
2856 	++*pos;
2857 	st->last_pos = *pos;
2858 	return rc;
2859 }
2860 EXPORT_IPV6_MOD(tcp_seq_next);
2861 
2862 void tcp_seq_stop(struct seq_file *seq, void *v)
2863 {
2864 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2865 	struct tcp_iter_state *st = seq->private;
2866 
2867 	switch (st->state) {
2868 	case TCP_SEQ_STATE_LISTENING:
2869 		if (v != SEQ_START_TOKEN)
2870 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2871 		break;
2872 	case TCP_SEQ_STATE_ESTABLISHED:
2873 		if (v)
2874 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2875 		break;
2876 	}
2877 }
2878 EXPORT_IPV6_MOD(tcp_seq_stop);
2879 
2880 static void get_openreq4(const struct request_sock *req,
2881 			 struct seq_file *f, int i)
2882 {
2883 	const struct inet_request_sock *ireq = inet_rsk(req);
2884 	long delta = req->rsk_timer.expires - jiffies;
2885 
2886 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2887 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2888 		i,
2889 		ireq->ir_loc_addr,
2890 		ireq->ir_num,
2891 		ireq->ir_rmt_addr,
2892 		ntohs(ireq->ir_rmt_port),
2893 		TCP_SYN_RECV,
2894 		0, 0, /* could print option size, but that is af dependent. */
2895 		1,    /* timers active (only the expire timer) */
2896 		jiffies_delta_to_clock_t(delta),
2897 		req->num_timeout,
2898 		from_kuid_munged(seq_user_ns(f),
2899 				 sk_uid(req->rsk_listener)),
2900 		0,  /* non standard timer */
2901 		0, /* open_requests have no inode */
2902 		0,
2903 		req);
2904 }
2905 
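/* Format one /proc/net/tcp line for a full socket.  Addresses are dumped
 * as hex of the raw __be32 value (so they appear byte-swapped on
 * little-endian hosts) and ports as host-order hex, e.g. on a
 * little-endian host a listener on 127.0.0.1:8080 shows up roughly as
 * "0100007F:1F90 00000000:0000 0A", 0A being TCP_LISTEN.  The "tr"
 * column reflects timer_active below: 1 retransmit/loss probe,
 * 2 keepalive, 4 zero-window probe, 0 none.
 */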
2906 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2907 {
2908 	int timer_active;
2909 	unsigned long timer_expires;
2910 	const struct tcp_sock *tp = tcp_sk(sk);
2911 	const struct inet_connection_sock *icsk = inet_csk(sk);
2912 	const struct inet_sock *inet = inet_sk(sk);
2913 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2914 	__be32 dest = inet->inet_daddr;
2915 	__be32 src = inet->inet_rcv_saddr;
2916 	__u16 destp = ntohs(inet->inet_dport);
2917 	__u16 srcp = ntohs(inet->inet_sport);
2918 	u8 icsk_pending;
2919 	int rx_queue;
2920 	int state;
2921 
2922 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2923 	if (icsk_pending == ICSK_TIME_RETRANS ||
2924 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2925 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2926 		timer_active	= 1;
2927 		timer_expires	= icsk_timeout(icsk);
2928 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2929 		timer_active	= 4;
2930 		timer_expires	= icsk_timeout(icsk);
2931 	} else if (timer_pending(&sk->sk_timer)) {
2932 		timer_active	= 2;
2933 		timer_expires	= sk->sk_timer.expires;
2934 	} else {
2935 		timer_active	= 0;
2936 		timer_expires = jiffies;
2937 	}
2938 
2939 	state = inet_sk_state_load(sk);
2940 	if (state == TCP_LISTEN)
2941 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2942 	else
2943 		/* Because we don't lock the socket,
2944 		 * we might find a transient negative value.
2945 		 */
2946 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2947 				      READ_ONCE(tp->copied_seq), 0);
2948 
2949 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2950 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2951 		i, src, srcp, dest, destp, state,
2952 		READ_ONCE(tp->write_seq) - tp->snd_una,
2953 		rx_queue,
2954 		timer_active,
2955 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2956 		icsk->icsk_retransmits,
2957 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2958 		icsk->icsk_probes_out,
2959 		sock_i_ino(sk),
2960 		refcount_read(&sk->sk_refcnt), sk,
2961 		jiffies_to_clock_t(icsk->icsk_rto),
2962 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2963 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2964 		tcp_snd_cwnd(tp),
2965 		state == TCP_LISTEN ?
2966 		    fastopenq->max_qlen :
2967 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2968 }
2969 
2970 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2971 			       struct seq_file *f, int i)
2972 {
2973 	long delta = tw->tw_timer.expires - jiffies;
2974 	__be32 dest, src;
2975 	__u16 destp, srcp;
2976 
2977 	dest  = tw->tw_daddr;
2978 	src   = tw->tw_rcv_saddr;
2979 	destp = ntohs(tw->tw_dport);
2980 	srcp  = ntohs(tw->tw_sport);
2981 
2982 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2983 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2984 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2985 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2986 		refcount_read(&tw->tw_refcnt), tw);
2987 }
2988 
2989 #define TMPSZ 150
2990 
2991 static int tcp4_seq_show(struct seq_file *seq, void *v)
2992 {
2993 	struct tcp_iter_state *st;
2994 	struct sock *sk = v;
2995 
2996 	seq_setwidth(seq, TMPSZ - 1);
2997 	if (v == SEQ_START_TOKEN) {
2998 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2999 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
3000 			   "inode");
3001 		goto out;
3002 	}
3003 	st = seq->private;
3004 
3005 	if (sk->sk_state == TCP_TIME_WAIT)
3006 		get_timewait4_sock(v, seq, st->num);
3007 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
3008 		get_openreq4(v, seq, st->num);
3009 	else
3010 		get_tcp4_sock(v, seq, st->num);
3011 out:
3012 	seq_pad(seq, '\n');
3013 	return 0;
3014 }
3015 
3016 #ifdef CONFIG_BPF_SYSCALL
3017 struct bpf_tcp_iter_state {
3018 	struct tcp_iter_state state;
3019 	unsigned int cur_sk;
3020 	unsigned int end_sk;
3021 	unsigned int max_sk;
3022 	struct sock **batch;
3023 	bool st_bucket_done;
3024 };
3025 
3026 struct bpf_iter__tcp {
3027 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3028 	__bpf_md_ptr(struct sock_common *, sk_common);
3029 	uid_t uid __aligned(8);
3030 };
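
/* A minimal BPF iterator program for the "tcp" target registered below
 * might look like the following sketch (assuming a build against
 * vmlinux.h plus libbpf, whose bpf_tracing.h provides BPF_SEQ_PRINTF):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u state=%u\n",
 *			       skc->skc_family, skc->skc_state);
 *		return 0;
 *	}
 *
 * Reading the pinned iterator then produces one line per socket, plus a
 * final call with ctx->sk_common == NULL at the end of the walk (see
 * bpf_iter_tcp_seq_stop()).
 */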
3031 
3032 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3033 			     struct sock_common *sk_common, uid_t uid)
3034 {
3035 	struct bpf_iter__tcp ctx;
3036 
3037 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3038 	ctx.meta = meta;
3039 	ctx.sk_common = sk_common;
3040 	ctx.uid = uid;
3041 	return bpf_iter_run_prog(prog, &ctx);
3042 }
3043 
3044 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3045 {
3046 	while (iter->cur_sk < iter->end_sk)
3047 		sock_gen_put(iter->batch[iter->cur_sk++]);
3048 }
3049 
3050 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3051 				      unsigned int new_batch_sz)
3052 {
3053 	struct sock **new_batch;
3054 
3055 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3056 			     GFP_USER | __GFP_NOWARN);
3057 	if (!new_batch)
3058 		return -ENOMEM;
3059 
3060 	bpf_iter_tcp_put_batch(iter);
3061 	kvfree(iter->batch);
3062 	iter->batch = new_batch;
3063 	iter->max_sk = new_batch_sz;
3064 
3065 	return 0;
3066 }
3067 
3068 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3069 						 struct sock *start_sk)
3070 {
3071 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3072 	struct bpf_tcp_iter_state *iter = seq->private;
3073 	struct tcp_iter_state *st = &iter->state;
3074 	struct hlist_nulls_node *node;
3075 	unsigned int expected = 1;
3076 	struct sock *sk;
3077 
3078 	sock_hold(start_sk);
3079 	iter->batch[iter->end_sk++] = start_sk;
3080 
3081 	sk = sk_nulls_next(start_sk);
3082 	sk_nulls_for_each_from(sk, node) {
3083 		if (seq_sk_match(seq, sk)) {
3084 			if (iter->end_sk < iter->max_sk) {
3085 				sock_hold(sk);
3086 				iter->batch[iter->end_sk++] = sk;
3087 			}
3088 			expected++;
3089 		}
3090 	}
3091 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3092 
3093 	return expected;
3094 }
3095 
3096 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3097 						   struct sock *start_sk)
3098 {
3099 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3100 	struct bpf_tcp_iter_state *iter = seq->private;
3101 	struct tcp_iter_state *st = &iter->state;
3102 	struct hlist_nulls_node *node;
3103 	unsigned int expected = 1;
3104 	struct sock *sk;
3105 
3106 	sock_hold(start_sk);
3107 	iter->batch[iter->end_sk++] = start_sk;
3108 
3109 	sk = sk_nulls_next(start_sk);
3110 	sk_nulls_for_each_from(sk, node) {
3111 		if (seq_sk_match(seq, sk)) {
3112 			if (iter->end_sk < iter->max_sk) {
3113 				sock_hold(sk);
3114 				iter->batch[iter->end_sk++] = sk;
3115 			}
3116 			expected++;
3117 		}
3118 	}
3119 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3120 
3121 	return expected;
3122 }
3123 
3124 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3125 {
3126 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3127 	struct bpf_tcp_iter_state *iter = seq->private;
3128 	struct tcp_iter_state *st = &iter->state;
3129 	unsigned int expected;
3130 	bool resized = false;
3131 	struct sock *sk;
3132 
3133 	/* The st->bucket is done.  Directly advance to the next
3134 	 * bucket instead of having tcp_seek_last_pos() skip sockets
3135 	 * one by one in the current bucket only to find out it has to
3136 	 * advance to the next bucket.
3137 	 */
3138 	if (iter->st_bucket_done) {
3139 		st->offset = 0;
3140 		st->bucket++;
3141 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3142 		    st->bucket > hinfo->lhash2_mask) {
3143 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3144 			st->bucket = 0;
3145 		}
3146 	}
3147 
3148 again:
3149 	/* Get a new batch */
3150 	iter->cur_sk = 0;
3151 	iter->end_sk = 0;
3152 	iter->st_bucket_done = false;
3153 
3154 	sk = tcp_seek_last_pos(seq);
3155 	if (!sk)
3156 		return NULL; /* Done */
3157 
3158 	if (st->state == TCP_SEQ_STATE_LISTENING)
3159 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3160 	else
3161 		expected = bpf_iter_tcp_established_batch(seq, sk);
3162 
3163 	if (iter->end_sk == expected) {
3164 		iter->st_bucket_done = true;
3165 		return sk;
3166 	}
3167 
3168 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3169 		resized = true;
3170 		goto again;
3171 	}
3172 
3173 	return sk;
3174 }
3175 
3176 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3177 {
3178 	/* bpf iter does not support lseek, so it always
3179 	 * continues from where it was stop()-ped.
3180 	 */
3181 	if (*pos)
3182 		return bpf_iter_tcp_batch(seq);
3183 
3184 	return SEQ_START_TOKEN;
3185 }
3186 
3187 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3188 {
3189 	struct bpf_tcp_iter_state *iter = seq->private;
3190 	struct tcp_iter_state *st = &iter->state;
3191 	struct sock *sk;
3192 
3193 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3194 	 * already been through seq_show(), so advance to the next sk in
3195 	 * the batch.
3196 	 */
3197 	if (iter->cur_sk < iter->end_sk) {
3198 		/* Keeping st->num consistent in tcp_iter_state.
3199 		 * bpf_iter_tcp does not use st->num.
3200 		 * meta.seq_num is used instead.
3201 		 */
3202 		st->num++;
3203 		/* Move st->offset to the next sk in the bucket such that
3204 		 * the future start() will resume at st->offset in
3205 		 * st->bucket.  See tcp_seek_last_pos().
3206 		 */
3207 		st->offset++;
3208 		sock_gen_put(iter->batch[iter->cur_sk++]);
3209 	}
3210 
3211 	if (iter->cur_sk < iter->end_sk)
3212 		sk = iter->batch[iter->cur_sk];
3213 	else
3214 		sk = bpf_iter_tcp_batch(seq);
3215 
3216 	++*pos;
3217 	/* Keeping st->last_pos consistent in tcp_iter_state.
3218 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3219 	 */
3220 	st->last_pos = *pos;
3221 	return sk;
3222 }
3223 
3224 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3225 {
3226 	struct bpf_iter_meta meta;
3227 	struct bpf_prog *prog;
3228 	struct sock *sk = v;
3229 	uid_t uid;
3230 	int ret;
3231 
3232 	if (v == SEQ_START_TOKEN)
3233 		return 0;
3234 
3235 	if (sk_fullsock(sk))
3236 		lock_sock(sk);
3237 
3238 	if (unlikely(sk_unhashed(sk))) {
3239 		ret = SEQ_SKIP;
3240 		goto unlock;
3241 	}
3242 
3243 	if (sk->sk_state == TCP_TIME_WAIT) {
3244 		uid = 0;
3245 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3246 		const struct request_sock *req = v;
3247 
3248 		uid = from_kuid_munged(seq_user_ns(seq),
3249 				       sk_uid(req->rsk_listener));
3250 	} else {
3251 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3252 	}
3253 
3254 	meta.seq = seq;
3255 	prog = bpf_iter_get_info(&meta, false);
3256 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3257 
3258 unlock:
3259 	if (sk_fullsock(sk))
3260 		release_sock(sk);
3261 	return ret;
3262 
3263 }
3264 
3265 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3266 {
3267 	struct bpf_tcp_iter_state *iter = seq->private;
3268 	struct bpf_iter_meta meta;
3269 	struct bpf_prog *prog;
3270 
3271 	if (!v) {
3272 		meta.seq = seq;
3273 		prog = bpf_iter_get_info(&meta, true);
3274 		if (prog)
3275 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3276 	}
3277 
3278 	if (iter->cur_sk < iter->end_sk) {
3279 		bpf_iter_tcp_put_batch(iter);
3280 		iter->st_bucket_done = false;
3281 	}
3282 }
3283 
3284 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3285 	.show		= bpf_iter_tcp_seq_show,
3286 	.start		= bpf_iter_tcp_seq_start,
3287 	.next		= bpf_iter_tcp_seq_next,
3288 	.stop		= bpf_iter_tcp_seq_stop,
3289 };
3290 #endif
3291 static unsigned short seq_file_family(const struct seq_file *seq)
3292 {
3293 	const struct tcp_seq_afinfo *afinfo;
3294 
3295 #ifdef CONFIG_BPF_SYSCALL
3296 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3297 	if (seq->op == &bpf_iter_tcp_seq_ops)
3298 		return AF_UNSPEC;
3299 #endif
3300 
3301 	/* Iterated from proc fs */
3302 	afinfo = pde_data(file_inode(seq->file));
3303 	return afinfo->family;
3304 }
3305 
3306 static const struct seq_operations tcp4_seq_ops = {
3307 	.show		= tcp4_seq_show,
3308 	.start		= tcp_seq_start,
3309 	.next		= tcp_seq_next,
3310 	.stop		= tcp_seq_stop,
3311 };
3312 
3313 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3314 	.family		= AF_INET,
3315 };
3316 
3317 static int __net_init tcp4_proc_init_net(struct net *net)
3318 {
3319 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3320 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3321 		return -ENOMEM;
3322 	return 0;
3323 }
3324 
3325 static void __net_exit tcp4_proc_exit_net(struct net *net)
3326 {
3327 	remove_proc_entry("tcp", net->proc_net);
3328 }
3329 
3330 static struct pernet_operations tcp4_net_ops = {
3331 	.init = tcp4_proc_init_net,
3332 	.exit = tcp4_proc_exit_net,
3333 };
3334 
3335 int __init tcp4_proc_init(void)
3336 {
3337 	return register_pernet_subsys(&tcp4_net_ops);
3338 }
3339 
3340 void tcp4_proc_exit(void)
3341 {
3342 	unregister_pernet_subsys(&tcp4_net_ops);
3343 }
3344 #endif /* CONFIG_PROC_FS */
3345 
3346 /* @wake is one when sk_stream_write_space() calls us.
3347  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3348  * This mimics the strategy used in sock_def_write_space().
3349  */
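/* Example: with net.ipv4.tcp_notsent_lowat = 131072 and wake == 1,
 * EPOLLOUT is reported once fewer than 65536 bytes remain unsent,
 * because (notsent_bytes << 1) must drop below 131072.
 */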
3350 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3351 {
3352 	const struct tcp_sock *tp = tcp_sk(sk);
3353 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3354 			    READ_ONCE(tp->snd_nxt);
3355 
3356 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3357 }
3358 EXPORT_SYMBOL(tcp_stream_memory_free);
3359 
3360 struct proto tcp_prot = {
3361 	.name			= "TCP",
3362 	.owner			= THIS_MODULE,
3363 	.close			= tcp_close,
3364 	.pre_connect		= tcp_v4_pre_connect,
3365 	.connect		= tcp_v4_connect,
3366 	.disconnect		= tcp_disconnect,
3367 	.accept			= inet_csk_accept,
3368 	.ioctl			= tcp_ioctl,
3369 	.init			= tcp_v4_init_sock,
3370 	.destroy		= tcp_v4_destroy_sock,
3371 	.shutdown		= tcp_shutdown,
3372 	.setsockopt		= tcp_setsockopt,
3373 	.getsockopt		= tcp_getsockopt,
3374 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3375 	.keepalive		= tcp_set_keepalive,
3376 	.recvmsg		= tcp_recvmsg,
3377 	.sendmsg		= tcp_sendmsg,
3378 	.splice_eof		= tcp_splice_eof,
3379 	.backlog_rcv		= tcp_v4_do_rcv,
3380 	.release_cb		= tcp_release_cb,
3381 	.hash			= inet_hash,
3382 	.unhash			= inet_unhash,
3383 	.get_port		= inet_csk_get_port,
3384 	.put_port		= inet_put_port,
3385 #ifdef CONFIG_BPF_SYSCALL
3386 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3387 #endif
3388 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3389 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3390 	.stream_memory_free	= tcp_stream_memory_free,
3391 	.sockets_allocated	= &tcp_sockets_allocated,
3392 	.orphan_count		= &tcp_orphan_count,
3393 
3394 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3395 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3396 
3397 	.memory_pressure	= &tcp_memory_pressure,
3398 	.sysctl_mem		= sysctl_tcp_mem,
3399 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3400 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3401 	.max_header		= MAX_TCP_HEADER,
3402 	.obj_size		= sizeof(struct tcp_sock),
3403 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3404 	.twsk_prot		= &tcp_timewait_sock_ops,
3405 	.rsk_prot		= &tcp_request_sock_ops,
3406 	.h.hashinfo		= NULL,
3407 	.no_autobind		= true,
3408 	.diag_destroy		= tcp_abort,
3409 };
3410 EXPORT_SYMBOL(tcp_prot);
3411 
3412 static void __net_exit tcp_sk_exit(struct net *net)
3413 {
3414 	if (net->ipv4.tcp_congestion_control)
3415 		bpf_module_put(net->ipv4.tcp_congestion_control,
3416 			       net->ipv4.tcp_congestion_control->owner);
3417 }
3418 
3419 static void __net_init tcp_set_hashinfo(struct net *net)
3420 {
3421 	struct inet_hashinfo *hinfo;
3422 	unsigned int ehash_entries;
3423 	struct net *old_net;
3424 
3425 	if (net_eq(net, &init_net))
3426 		goto fallback;
3427 
3428 	old_net = current->nsproxy->net_ns;
3429 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3430 	if (!ehash_entries)
3431 		goto fallback;
3432 
3433 	ehash_entries = roundup_pow_of_two(ehash_entries);
3434 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3435 	if (!hinfo) {
3436 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3437 			"for a netns, fallback to the global one\n",
3438 			ehash_entries);
3439 fallback:
3440 		hinfo = &tcp_hashinfo;
3441 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3442 	}
3443 
3444 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3445 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3446 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3447 }
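
/* Example: a netns created from a parent netns that has
 * net.ipv4.tcp_child_ehash_entries = 1000 gets a private 1024-entry
 * ehash, sysctl_max_tw_buckets = 512 and sysctl_max_syn_backlog = 128.
 */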
3448 
3449 static int __net_init tcp_sk_init(struct net *net)
3450 {
3451 	net->ipv4.sysctl_tcp_ecn = 2;
3452 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3453 
3454 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3455 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3456 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3457 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3458 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3459 
3460 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3461 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3462 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3463 
3464 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3465 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3466 	net->ipv4.sysctl_tcp_syncookies = 1;
3467 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3468 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3469 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3470 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3471 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3472 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3473 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3474 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3475 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3476 
3477 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3478 	tcp_set_hashinfo(net);
3479 
3480 	net->ipv4.sysctl_tcp_sack = 1;
3481 	net->ipv4.sysctl_tcp_window_scaling = 1;
3482 	net->ipv4.sysctl_tcp_timestamps = 1;
3483 	net->ipv4.sysctl_tcp_early_retrans = 3;
3484 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3485 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3486 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3487 	net->ipv4.sysctl_tcp_max_reordering = 300;
3488 	net->ipv4.sysctl_tcp_dsack = 1;
3489 	net->ipv4.sysctl_tcp_app_win = 31;
3490 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3491 	net->ipv4.sysctl_tcp_frto = 2;
3492 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3493 	/* This limits the percentage of the congestion window which we
3494 	 * will allow a single TSO frame to consume.  Building TSO frames
3495 	 * which are too large can cause TCP streams to be bursty.
3496 	 */
3497 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3498 	/* Default TSQ limit of 4 MB */
3499 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3500 
3501 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3502 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3503 
3504 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3505 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3506 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3507 	net->ipv4.sysctl_tcp_autocorking = 1;
3508 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3509 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3510 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3511 	if (net != &init_net) {
3512 		memcpy(net->ipv4.sysctl_tcp_rmem,
3513 		       init_net.ipv4.sysctl_tcp_rmem,
3514 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3515 		memcpy(net->ipv4.sysctl_tcp_wmem,
3516 		       init_net.ipv4.sysctl_tcp_wmem,
3517 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3518 	}
3519 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3520 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3521 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3522 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3523 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3524 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3525 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3526 
3527 	/* Set default values for PLB */
3528 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3529 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3530 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3531 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3532 	/* Default congestion threshold for PLB to mark a round is 50% */
3533 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3534 
3535 	/* Reno is always built in */
3536 	if (!net_eq(net, &init_net) &&
3537 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3538 			       init_net.ipv4.tcp_congestion_control->owner))
3539 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3540 	else
3541 		net->ipv4.tcp_congestion_control = &tcp_reno;
3542 
3543 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3544 	net->ipv4.sysctl_tcp_shrink_window = 0;
3545 
3546 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3547 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3548 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3549 
3550 	return 0;
3551 }
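
/* Most of the defaults above back per-netns knobs under
 * /proc/sys/net/ipv4/, e.g.
 *	sysctl -w net.ipv4.tcp_syncookies=1
 *	sysctl -w net.ipv4.tcp_fin_timeout=60
 * update sysctl_tcp_syncookies and sysctl_tcp_fin_timeout for the
 * current netns.
 */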
3552 
3553 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3554 {
3555 	struct net *net;
3556 
3557 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3558 	 * and the failed setup_net() error unwinding path are serialized.
3559 	 *
3560 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3561 	 * net_exit_list, so the thread that dismantles a particular twsk must
3562 	 * do so without another thread progressing to refcount_dec_and_test() of
3563 	 * tcp_death_row.tw_refcount.
3564 	 */
3565 	mutex_lock(&tcp_exit_batch_mutex);
3566 
3567 	tcp_twsk_purge(net_exit_list);
3568 
3569 	list_for_each_entry(net, net_exit_list, exit_list) {
3570 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3571 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3572 		tcp_fastopen_ctx_destroy(net);
3573 	}
3574 
3575 	mutex_unlock(&tcp_exit_batch_mutex);
3576 }
3577 
3578 static struct pernet_operations __net_initdata tcp_sk_ops = {
3579        .init	   = tcp_sk_init,
3580        .exit	   = tcp_sk_exit,
3581        .exit_batch = tcp_sk_exit_batch,
3582 };
3583 
3584 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3585 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3586 		     struct sock_common *sk_common, uid_t uid)
3587 
3588 #define INIT_BATCH_SZ 16
3589 
3590 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3591 {
3592 	struct bpf_tcp_iter_state *iter = priv_data;
3593 	int err;
3594 
3595 	err = bpf_iter_init_seq_net(priv_data, aux);
3596 	if (err)
3597 		return err;
3598 
3599 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3600 	if (err) {
3601 		bpf_iter_fini_seq_net(priv_data);
3602 		return err;
3603 	}
3604 
3605 	return 0;
3606 }
3607 
3608 static void bpf_iter_fini_tcp(void *priv_data)
3609 {
3610 	struct bpf_tcp_iter_state *iter = priv_data;
3611 
3612 	bpf_iter_fini_seq_net(priv_data);
3613 	kvfree(iter->batch);
3614 }
3615 
3616 static const struct bpf_iter_seq_info tcp_seq_info = {
3617 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3618 	.init_seq_private	= bpf_iter_init_tcp,
3619 	.fini_seq_private	= bpf_iter_fini_tcp,
3620 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3621 };
3622 
3623 static const struct bpf_func_proto *
3624 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3625 			    const struct bpf_prog *prog)
3626 {
3627 	switch (func_id) {
3628 	case BPF_FUNC_setsockopt:
3629 		return &bpf_sk_setsockopt_proto;
3630 	case BPF_FUNC_getsockopt:
3631 		return &bpf_sk_getsockopt_proto;
3632 	default:
3633 		return NULL;
3634 	}
3635 }
3636 
3637 static struct bpf_iter_reg tcp_reg_info = {
3638 	.target			= "tcp",
3639 	.ctx_arg_info_size	= 1,
3640 	.ctx_arg_info		= {
3641 		{ offsetof(struct bpf_iter__tcp, sk_common),
3642 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3643 	},
3644 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3645 	.seq_info		= &tcp_seq_info,
3646 };
3647 
3648 static void __init bpf_iter_register(void)
3649 {
3650 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3651 	if (bpf_iter_reg_target(&tcp_reg_info))
3652 		pr_warn("Warning: could not register bpf iterator tcp\n");
3653 }
3654 
3655 #endif
3656 
3657 void __init tcp_v4_init(void)
3658 {
3659 	int cpu, res;
3660 
3661 	for_each_possible_cpu(cpu) {
3662 		struct sock *sk;
3663 
3664 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3665 					   IPPROTO_TCP, &init_net);
3666 		if (res)
3667 			panic("Failed to create the TCP control socket.\n");
3668 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3669 
3670 		/* Please enforce IP_DF and IPID==0 for RST and
3671 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3672 		 */
3673 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3674 
3675 		sk->sk_clockid = CLOCK_MONOTONIC;
3676 
3677 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3678 	}
3679 	if (register_pernet_subsys(&tcp_sk_ops))
3680 		panic("Failed to create the TCP control socket.\n");
3681 
3682 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3683 	bpf_iter_register();
3684 #endif
3685 }
3686