xref: /linux/net/ipv4/tcp_ipv4.c (revision 07fdad3a93756b872da7b53647715c48d0f4a2d0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
40  *					year-long coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 #include <linux/sock_diag.h>
62 
63 #include <net/aligned_data.h>
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/tcp_ecn.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/inet_ecn.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 #include <net/rstreason.h>
78 #include <net/psp.h>
79 
80 #include <linux/inet.h>
81 #include <linux/ipv6.h>
82 #include <linux/stddef.h>
83 #include <linux/proc_fs.h>
84 #include <linux/seq_file.h>
85 #include <linux/inetdevice.h>
86 #include <linux/btf_ids.h>
87 #include <linux/skbuff_ref.h>
88 
89 #include <crypto/hash.h>
90 #include <linux/scatterlist.h>
91 
92 #include <trace/events/tcp.h>
93 
94 #ifdef CONFIG_TCP_MD5SIG
95 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
96 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
97 #endif
98 
99 struct inet_hashinfo tcp_hashinfo;
100 
101 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
102 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
103 };
104 
105 static DEFINE_MUTEX(tcp_exit_batch_mutex);
106 
107 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
108 {
109 	return secure_tcp_seq(ip_hdr(skb)->daddr,
110 			      ip_hdr(skb)->saddr,
111 			      tcp_hdr(skb)->dest,
112 			      tcp_hdr(skb)->source);
113 }
114 
115 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
116 {
117 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
118 }
119 
120 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
121 {
122 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
123 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
124 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
125 	struct tcp_sock *tp = tcp_sk(sk);
126 	int ts_recent_stamp;
127 	u32 reuse_thresh;
128 
129 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
130 		reuse = 0;
131 
132 	if (reuse == 2) {
133 		/* Still does not detect *everything* that goes through
134 		 * lo, since we require a loopback src or dst address
135 		 * or direct binding to 'lo' interface.
136 		 */
137 		bool loopback = false;
138 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
139 			loopback = true;
140 #if IS_ENABLED(CONFIG_IPV6)
141 		if (tw->tw_family == AF_INET6) {
142 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
144 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
145 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
146 				loopback = true;
147 		} else
148 #endif
149 		{
150 			if (ipv4_is_loopback(tw->tw_daddr) ||
151 			    ipv4_is_loopback(tw->tw_rcv_saddr))
152 				loopback = true;
153 		}
154 		if (!loopback)
155 			reuse = 0;
156 	}
157 
158 	/* With PAWS, it is safe from the viewpoint
159 	   of data integrity. Even without PAWS it is safe provided sequence
160 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
161 
162 	   Actually, the idea is close to VJ's: only the timestamp cache is
163 	   held not per host but per port pair, and the TW bucket is used as
164 	   the state holder.
165 
166 	   If the TW bucket has already been destroyed we fall back to VJ's
167 	   scheme and use the initial timestamp retrieved from the peer table.
168 	 */
169 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
170 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
171 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
172 	if (ts_recent_stamp &&
173 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
174 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
175 		 * and releasing the bucket lock.
176 		 */
177 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
178 			return 0;
179 
180 		/* In case of repair and re-using TIME-WAIT sockets we still
181 		 * want to be sure that it is safe as above but honor the
182 		 * sequence numbers and time stamps set as part of the repair
183 		 * process.
184 		 *
185 		 * Without this check re-using a TIME-WAIT socket with TCP
186 		 * repair would accumulate a -1 on the repair assigned
187 		 * sequence number. The first time it is reused the sequence
188 		 * is -1, the second time -2, etc. This fixes that issue
189 		 * without appearing to create any others.
190 		 */
191 		if (likely(!tp->repair)) {
192 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
193 
194 			if (!seq)
195 				seq = 1;
196 			WRITE_ONCE(tp->write_seq, seq);
197 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
198 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
199 		}
200 
201 		return 1;
202 	}
203 
204 	return 0;
205 }
206 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
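
/*
 * A minimal user-space sketch (not part of this file; helper names are
 * ours) of the wraparound-safe comparison behind the reuse_thresh test
 * above: reuse is only considered once tcp_clock_ms() has advanced past
 * tw_entry_stamp + sysctl_tcp_tw_reuse_delay.
 */
#if 0	/* illustrative only, never compiled */
#include <stdbool.h>
#include <stdint.h>

/* Same test as the kernel's time_after32(a, b). */
static bool ts_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

/* True once the current ms clock has moved past entry + delay. */
static bool tw_reuse_delay_elapsed(uint32_t now_ms, uint32_t entry_ms,
				   uint32_t delay_ms)
{
	return ts_after32(now_ms, entry_ms + delay_ms);
}
#endif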
207 
208 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
209 			      int addr_len)
210 {
211 	/* This check is replicated from tcp_v4_connect() and is intended to
212 	 * prevent the BPF program called below from accessing bytes that are
213 	 * outside the bound specified by the user in addr_len.
214 	 */
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	sock_owned_by_me(sk);
219 
220 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
221 }
222 
223 /* This will initiate an outgoing connection. */
224 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
225 {
226 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
227 	struct inet_timewait_death_row *tcp_death_row;
228 	struct inet_sock *inet = inet_sk(sk);
229 	struct tcp_sock *tp = tcp_sk(sk);
230 	struct ip_options_rcu *inet_opt;
231 	struct net *net = sock_net(sk);
232 	__be16 orig_sport, orig_dport;
233 	__be32 daddr, nexthop;
234 	struct flowi4 *fl4;
235 	struct rtable *rt;
236 	int err;
237 
238 	if (addr_len < sizeof(struct sockaddr_in))
239 		return -EINVAL;
240 
241 	if (usin->sin_family != AF_INET)
242 		return -EAFNOSUPPORT;
243 
244 	nexthop = daddr = usin->sin_addr.s_addr;
245 	inet_opt = rcu_dereference_protected(inet->inet_opt,
246 					     lockdep_sock_is_held(sk));
247 	if (inet_opt && inet_opt->opt.srr) {
248 		if (!daddr)
249 			return -EINVAL;
250 		nexthop = inet_opt->opt.faddr;
251 	}
252 
253 	orig_sport = inet->inet_sport;
254 	orig_dport = usin->sin_port;
255 	fl4 = &inet->cork.fl.u.ip4;
256 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
257 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
258 			      orig_dport, sk);
259 	if (IS_ERR(rt)) {
260 		err = PTR_ERR(rt);
261 		if (err == -ENETUNREACH)
262 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
263 		return err;
264 	}
265 
266 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
267 		ip_rt_put(rt);
268 		return -ENETUNREACH;
269 	}
270 
271 	if (!inet_opt || !inet_opt->opt.srr)
272 		daddr = fl4->daddr;
273 
274 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
275 
276 	if (!inet->inet_saddr) {
277 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
278 		if (err) {
279 			ip_rt_put(rt);
280 			return err;
281 		}
282 	} else {
283 		sk_rcv_saddr_set(sk, inet->inet_saddr);
284 	}
285 
286 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
287 		/* Reset inherited state */
288 		tp->rx_opt.ts_recent	   = 0;
289 		tp->rx_opt.ts_recent_stamp = 0;
290 		if (likely(!tp->repair))
291 			WRITE_ONCE(tp->write_seq, 0);
292 	}
293 
294 	inet->inet_dport = usin->sin_port;
295 	sk_daddr_set(sk, daddr);
296 
297 	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
298 	if (inet_opt)
299 		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
300 
301 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
302 
303 	/* Socket identity is still unknown (sport may be zero).
304 	 * However, we set the state to SYN-SENT and, without releasing the
305 	 * socket lock, select a source port, enter ourselves into the hash
306 	 * tables and complete initialization after this.
307 	 */
308 	tcp_set_state(sk, TCP_SYN_SENT);
309 	err = inet_hash_connect(tcp_death_row, sk);
310 	if (err)
311 		goto failure;
312 
313 	sk_set_txhash(sk);
314 
315 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
316 			       inet->inet_sport, inet->inet_dport, sk);
317 	if (IS_ERR(rt)) {
318 		err = PTR_ERR(rt);
319 		rt = NULL;
320 		goto failure;
321 	}
322 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
323 	/* OK, now commit destination to socket.  */
324 	sk->sk_gso_type = SKB_GSO_TCPV4;
325 	sk_setup_caps(sk, &rt->dst);
326 	rt = NULL;
327 
328 	if (likely(!tp->repair)) {
329 		if (!tp->write_seq)
330 			WRITE_ONCE(tp->write_seq,
331 				   secure_tcp_seq(inet->inet_saddr,
332 						  inet->inet_daddr,
333 						  inet->inet_sport,
334 						  usin->sin_port));
335 		WRITE_ONCE(tp->tsoffset,
336 			   secure_tcp_ts_off(net, inet->inet_saddr,
337 					     inet->inet_daddr));
338 	}
339 
340 	atomic_set(&inet->inet_id, get_random_u16());
341 
342 	if (tcp_fastopen_defer_connect(sk, &err))
343 		return err;
344 	if (err)
345 		goto failure;
346 
347 	err = tcp_connect(sk);
348 
349 	if (err)
350 		goto failure;
351 
352 	return 0;
353 
354 failure:
355 	/*
356 	 * This unhashes the socket and releases the local port,
357 	 * if necessary.
358 	 */
359 	tcp_set_state(sk, TCP_CLOSE);
360 	inet_bhash2_reset_saddr(sk);
361 	ip_rt_put(rt);
362 	sk->sk_route_caps = 0;
363 	inet->inet_dport = 0;
364 	return err;
365 }
366 EXPORT_IPV6_MOD(tcp_v4_connect);
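
/*
 * A minimal user-space sketch (not part of this file) of the connect()
 * call that lands in tcp_v4_connect() above: addr_len must cover a
 * struct sockaddr_in and sin_family must be AF_INET, otherwise the
 * kernel returns -EINVAL / -EAFNOSUPPORT.  Address and port are examples.
 */
#if 0	/* illustrative only, never compiled */
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_v4_example(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
#endif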
367 
368 /*
369  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
370  * It can be called through tcp_release_cb() if the socket was owned by the user
371  * at the time tcp_v4_err() was called to handle the ICMP message.
372  */
373 void tcp_v4_mtu_reduced(struct sock *sk)
374 {
375 	struct inet_sock *inet = inet_sk(sk);
376 	struct dst_entry *dst;
377 	u32 mtu;
378 
379 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
380 		return;
381 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
382 	dst = inet_csk_update_pmtu(sk, mtu);
383 	if (!dst)
384 		return;
385 
386 	/* Something is about to go wrong... Remember the soft error
387 	 * in case this connection is not able to recover.
388 	 */
389 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
390 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
391 
392 	mtu = dst_mtu(dst);
393 
394 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
395 	    ip_sk_accept_pmtu(sk) &&
396 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
397 		tcp_sync_mss(sk, mtu);
398 
399 		/* Resend the TCP packet because it's
400 		 * clear that the old packet has been
401 		 * dropped. This is the new "fast" path mtu
402 		 * discovery.
403 		 */
404 		tcp_simple_retransmit(sk);
405 	} /* else let the usual retransmit timer handle it */
406 }
407 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
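
/*
 * A minimal user-space sketch (not part of this file) of opting out of
 * path MTU discovery with the standard IP_MTU_DISCOVER option; that is
 * the inet->pmtudisc != IP_PMTUDISC_DONT condition tested above.
 */
#if 0	/* illustrative only, never compiled */
#include <netinet/in.h>
#include <sys/socket.h>

static int disable_pmtu_discovery(int fd)
{
	int val = IP_PMTUDISC_DONT;

	return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}
#endif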
408 
409 static void do_redirect(struct sk_buff *skb, struct sock *sk)
410 {
411 	struct dst_entry *dst = __sk_dst_check(sk, 0);
412 
413 	if (dst)
414 		dst->ops->redirect(dst, sk, skb);
415 }
416 
417 
418 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
419 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
420 {
421 	struct request_sock *req = inet_reqsk(sk);
422 	struct net *net = sock_net(sk);
423 
424 	/* ICMPs are not backlogged, hence we cannot get
425 	 * an established socket here.
426 	 */
427 	if (seq != tcp_rsk(req)->snt_isn) {
428 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
429 	} else if (abort) {
430 		/*
431 		 * Still in SYN_RECV, just remove it silently.
432 		 * There is no good way to pass the error to the newly
433 		 * created socket, and POSIX does not want network
434 		 * errors returned from accept().
435 		 */
436 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
437 		tcp_listendrop(req->rsk_listener);
438 	}
439 	reqsk_put(req);
440 }
441 EXPORT_IPV6_MOD(tcp_req_err);
442 
443 /* TCP-LD (RFC 6069) logic */
444 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
445 {
446 	struct inet_connection_sock *icsk = inet_csk(sk);
447 	struct tcp_sock *tp = tcp_sk(sk);
448 	struct sk_buff *skb;
449 	s32 remaining;
450 	u32 delta_us;
451 
452 	if (sock_owned_by_user(sk))
453 		return;
454 
455 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
456 	    !icsk->icsk_backoff)
457 		return;
458 
459 	skb = tcp_rtx_queue_head(sk);
460 	if (WARN_ON_ONCE(!skb))
461 		return;
462 
463 	icsk->icsk_backoff--;
464 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
465 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
466 
467 	tcp_mstamp_refresh(tp);
468 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
469 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
470 
471 	if (remaining > 0) {
472 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
473 	} else {
474 		/* RTO revert clocked out retransmission.
475 		 * Will retransmit now.
476 		 */
477 		tcp_retransmit_timer(sk);
478 	}
479 }
480 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
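
/*
 * A minimal sketch (helper name is ours) of the backoff arithmetic the
 * revert above undoes: the effective RTO is the base RTO shifted left
 * by the backoff count and clamped to the maximum, mirroring
 * inet_csk_rto_backoff().
 */
#if 0	/* illustrative only, never compiled */
#include <stdint.h>

static uint32_t rto_with_backoff(uint32_t base_rto, uint8_t backoff,
				 uint32_t rto_max)
{
	uint64_t when = (uint64_t)base_rto << backoff;

	return when > rto_max ? rto_max : (uint32_t)when;
}
#endif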
481 
482 /*
483  * This routine is called by the ICMP module when it gets some
484  * sort of error condition.  If err < 0 then the socket should
485  * be closed and the error returned to the user.  If err > 0
486  * it's just the icmp type << 8 | icmp code.  After adjustment the
487  * header points to the first 8 bytes of the tcp header.  We need
488  * to find the appropriate port.
489  *
490  * The locking strategy used here is very "optimistic". When
491  * someone else accesses the socket the ICMP is just dropped
492  * and for some paths there is no check at all.
493  * A more general error queue to queue errors for later handling
494  * is probably better.
495  *
496  */
497 
498 int tcp_v4_err(struct sk_buff *skb, u32 info)
499 {
500 	const struct iphdr *iph = (const struct iphdr *)skb->data;
501 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
502 	struct net *net = dev_net_rcu(skb->dev);
503 	const int type = icmp_hdr(skb)->type;
504 	const int code = icmp_hdr(skb)->code;
505 	struct request_sock *fastopen;
506 	struct tcp_sock *tp;
507 	u32 seq, snd_una;
508 	struct sock *sk;
509 	int err;
510 
511 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
512 				       ntohs(th->source), inet_iif(skb), 0);
513 	if (!sk) {
514 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
515 		return -ENOENT;
516 	}
517 	if (sk->sk_state == TCP_TIME_WAIT) {
518 		/* To increase the counter of ignored icmps for TCP-AO */
519 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
520 		inet_twsk_put(inet_twsk(sk));
521 		return 0;
522 	}
523 	seq = ntohl(th->seq);
524 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
525 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
526 				     type == ICMP_TIME_EXCEEDED ||
527 				     (type == ICMP_DEST_UNREACH &&
528 				      (code == ICMP_NET_UNREACH ||
529 				       code == ICMP_HOST_UNREACH)));
530 		return 0;
531 	}
532 
533 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
534 		sock_put(sk);
535 		return 0;
536 	}
537 
538 	bh_lock_sock(sk);
539 	/* If too many ICMPs get dropped on busy
540 	 * servers this needs to be solved differently.
541 	 * We do take care of the PMTU discovery (RFC 1191) special case:
542 	 * we can receive locally generated ICMP messages while the socket is held.
543 	 */
544 	if (sock_owned_by_user(sk)) {
545 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
546 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
547 	}
548 	if (sk->sk_state == TCP_CLOSE)
549 		goto out;
550 
551 	if (static_branch_unlikely(&ip4_min_ttl)) {
552 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
553 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
554 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
555 			goto out;
556 		}
557 	}
558 
559 	tp = tcp_sk(sk);
560 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
561 	fastopen = rcu_dereference(tp->fastopen_rsk);
562 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
563 	if (sk->sk_state != TCP_LISTEN &&
564 	    !between(seq, snd_una, tp->snd_nxt)) {
565 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
566 		goto out;
567 	}
568 
569 	switch (type) {
570 	case ICMP_REDIRECT:
571 		if (!sock_owned_by_user(sk))
572 			do_redirect(skb, sk);
573 		goto out;
574 	case ICMP_SOURCE_QUENCH:
575 		/* Just silently ignore these. */
576 		goto out;
577 	case ICMP_PARAMETERPROB:
578 		err = EPROTO;
579 		break;
580 	case ICMP_DEST_UNREACH:
581 		if (code > NR_ICMP_UNREACH)
582 			goto out;
583 
584 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
585 			/* We are not interested in TCP_LISTEN and open_requests
586 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
587 			 * they should go through unfragmented).
588 			 */
589 			if (sk->sk_state == TCP_LISTEN)
590 				goto out;
591 
592 			WRITE_ONCE(tp->mtu_info, info);
593 			if (!sock_owned_by_user(sk)) {
594 				tcp_v4_mtu_reduced(sk);
595 			} else {
596 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
597 					sock_hold(sk);
598 			}
599 			goto out;
600 		}
601 
602 		err = icmp_err_convert[code].errno;
603 		/* Check if this ICMP message allows reverting the backoff
604 		 * (see RFC 6069).
605 		 */
606 		if (!fastopen &&
607 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
608 			tcp_ld_RTO_revert(sk, seq);
609 		break;
610 	case ICMP_TIME_EXCEEDED:
611 		err = EHOSTUNREACH;
612 		break;
613 	default:
614 		goto out;
615 	}
616 
617 	switch (sk->sk_state) {
618 	case TCP_SYN_SENT:
619 	case TCP_SYN_RECV:
620 		/* Only in fast or simultaneous open. If a fast open socket is
621 		 * already accepted it is treated as a connected one below.
622 		 */
623 		if (fastopen && !fastopen->sk)
624 			break;
625 
626 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
627 
628 		if (!sock_owned_by_user(sk))
629 			tcp_done_with_error(sk, err);
630 		else
631 			WRITE_ONCE(sk->sk_err_soft, err);
632 		goto out;
633 	}
634 
635 	/* If we've already connected we will keep trying
636 	 * until we time out, or the user gives up.
637 	 *
638 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
639 	 * to be considered hard errors (well, FRAG_FAILED too,
640 	 * but it is obsoleted by PMTU discovery).
641 	 *
642 	 * Note that in the modern internet, where routing is unreliable
643 	 * and broken firewalls sit in each dark corner, sending random
644 	 * errors ordered by their masters, even these two messages finally lose
645 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
646 	 *
647 	 * Now we are in compliance with RFCs.
648 	 *							--ANK (980905)
649 	 */
650 
651 	if (!sock_owned_by_user(sk) &&
652 	    inet_test_bit(RECVERR, sk)) {
653 		WRITE_ONCE(sk->sk_err, err);
654 		sk_error_report(sk);
655 	} else	{ /* Only an error on timeout */
656 		WRITE_ONCE(sk->sk_err_soft, err);
657 	}
658 
659 out:
660 	bh_unlock_sock(sk);
661 	sock_put(sk);
662 	return 0;
663 }
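
/*
 * A minimal user-space sketch (not part of this file; names are ours)
 * of the wraparound-safe window test used above: an ICMP-quoted
 * sequence number is only accepted if snd_una <= seq <= snd_nxt,
 * mirroring the kernel's between().
 */
#if 0	/* illustrative only, never compiled */
#include <stdbool.h>
#include <stdint.h>

static bool seq_between(uint32_t seq, uint32_t low, uint32_t high)
{
	return high - low >= seq - low;
}
#endif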
664 
665 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
666 {
667 	struct tcphdr *th = tcp_hdr(skb);
668 
669 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
670 	skb->csum_start = skb_transport_header(skb) - skb->head;
671 	skb->csum_offset = offsetof(struct tcphdr, check);
672 }
673 
674 /* This routine computes an IPv4 TCP checksum. */
675 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
676 {
677 	const struct inet_sock *inet = inet_sk(sk);
678 
679 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
680 }
681 EXPORT_IPV6_MOD(tcp_v4_send_check);
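
/*
 * For reference only (not part of this file): __tcp_v4_send_check()
 * above merely seeds the pseudo-header sum for checksum offload.  The
 * sketch below computes the full RFC 1071 checksum that must end up on
 * the wire, assuming saddr/daddr are in host byte order, seg points at
 * the TCP header (check field zeroed) followed by the payload, and
 * len < 64 KiB.  The result still needs htons() before being stored.
 */
#if 0	/* illustrative only, never compiled */
#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>

static uint16_t tcp_v4_csum_ref(uint32_t saddr, uint32_t daddr,
				const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo-header: saddr, daddr, zero+protocol, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += IPPROTO_TCP;
	sum += (uint32_t)len;

	/* TCP header and payload as big-endian 16-bit words. */
	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)seg[i] << 8) | seg[i + 1];
	if (len & 1)
		sum += (uint32_t)seg[len - 1] << 8;

	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}
#endif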
682 
683 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
684 
685 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
686 				 const struct tcp_ao_hdr *aoh,
687 				 struct ip_reply_arg *arg, struct tcphdr *reply,
688 				 __be32 reply_options[REPLY_OPTIONS_LEN])
689 {
690 #ifdef CONFIG_TCP_AO
691 	int sdif = tcp_v4_sdif(skb);
692 	int dif = inet_iif(skb);
693 	int l3index = sdif ? dif : 0;
694 	bool allocated_traffic_key;
695 	struct tcp_ao_key *key;
696 	char *traffic_key;
697 	bool drop = true;
698 	u32 ao_sne = 0;
699 	u8 keyid;
700 
701 	rcu_read_lock();
702 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
703 				 &key, &traffic_key, &allocated_traffic_key,
704 				 &keyid, &ao_sne))
705 		goto out;
706 
707 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
708 				 (aoh->rnext_keyid << 8) | keyid);
709 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
710 	reply->doff = arg->iov[0].iov_len / 4;
711 
712 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
713 			    key, traffic_key,
714 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
715 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
716 			    reply, ao_sne))
717 		goto out;
718 	drop = false;
719 out:
720 	rcu_read_unlock();
721 	if (allocated_traffic_key)
722 		kfree(traffic_key);
723 	return drop;
724 #else
725 	return true;
726 #endif
727 }
728 
729 /*
730  *	This routine will send an RST to the other tcp.
731  *
732  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
733  *		      for the reset?
734  *	Answer: if a packet caused an RST, it is not for a socket
735  *		existing in our system; if it is matched to a socket,
736  *		it is just a duplicate segment or a bug in the other side's TCP.
737  *		So we build the reply based only on the parameters that
738  *		arrived with the segment.
739  *	Exception: precedence violation. We do not implement it in any case.
740  */
741 
742 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
743 			      enum sk_rst_reason reason)
744 {
745 	const struct tcphdr *th = tcp_hdr(skb);
746 	struct {
747 		struct tcphdr th;
748 		__be32 opt[REPLY_OPTIONS_LEN];
749 	} rep;
750 	const __u8 *md5_hash_location = NULL;
751 	const struct tcp_ao_hdr *aoh;
752 	struct ip_reply_arg arg;
753 #ifdef CONFIG_TCP_MD5SIG
754 	struct tcp_md5sig_key *key = NULL;
755 	unsigned char newhash[16];
756 	struct sock *sk1 = NULL;
757 	int genhash;
758 #endif
759 	u64 transmit_time = 0;
760 	struct sock *ctl_sk;
761 	struct net *net;
762 	u32 txhash = 0;
763 
764 	/* Never send a reset in response to a reset. */
765 	if (th->rst)
766 		return;
767 
768 	/* If sk is not NULL, it means we did a successful lookup and the
769 	 * incoming route had to be correct. prequeue might have dropped our dst.
770 	 */
771 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
772 		return;
773 
774 	/* Swap the send and the receive. */
775 	memset(&rep, 0, sizeof(rep));
776 	rep.th.dest   = th->source;
777 	rep.th.source = th->dest;
778 	rep.th.doff   = sizeof(struct tcphdr) / 4;
779 	rep.th.rst    = 1;
780 
781 	if (th->ack) {
782 		rep.th.seq = th->ack_seq;
783 	} else {
784 		rep.th.ack = 1;
785 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
786 				       skb->len - (th->doff << 2));
787 	}
788 
789 	memset(&arg, 0, sizeof(arg));
790 	arg.iov[0].iov_base = (unsigned char *)&rep;
791 	arg.iov[0].iov_len  = sizeof(rep.th);
792 
793 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
794 
795 	/* Invalid TCP option size or twice included auth */
796 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
797 		return;
798 
799 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
800 		return;
801 
802 #ifdef CONFIG_TCP_MD5SIG
803 	rcu_read_lock();
804 	if (sk && sk_fullsock(sk)) {
805 		const union tcp_md5_addr *addr;
806 		int l3index;
807 
808 		/* sdif being set means the packet ingressed via a device
809 		 * in an L3 domain and inet_iif is set to it.
810 		 */
811 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
812 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
813 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
814 	} else if (md5_hash_location) {
815 		const union tcp_md5_addr *addr;
816 		int sdif = tcp_v4_sdif(skb);
817 		int dif = inet_iif(skb);
818 		int l3index;
819 
820 		/*
821 		 * The active side is lost. Try to find the listening socket
822 		 * through the source port, and then find the md5 key through
823 		 * the listening socket. We do not loosen security here:
824 		 * the incoming packet is checked with the md5 hash of the found
825 		 * key, and no RST is generated if the md5 hash doesn't match.
826 		 */
827 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
828 					     th->source, ip_hdr(skb)->daddr,
829 					     ntohs(th->source), dif, sdif);
830 		/* don't send an RST if we can't find the key */
831 		if (!sk1)
832 			goto out;
833 
834 		/* sdif being set means the packet ingressed via a device
835 		 * in an L3 domain and dif is set to it.
836 		 */
837 		l3index = sdif ? dif : 0;
838 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
839 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
840 		if (!key)
841 			goto out;
842 
843 
844 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
845 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
846 			goto out;
847 
848 	}
849 
850 	if (key) {
851 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
852 				   (TCPOPT_NOP << 16) |
853 				   (TCPOPT_MD5SIG << 8) |
854 				   TCPOLEN_MD5SIG);
855 		/* Update length and the length the header thinks exists */
856 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 		rep.th.doff = arg.iov[0].iov_len / 4;
858 
859 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
860 				     key, ip_hdr(skb)->saddr,
861 				     ip_hdr(skb)->daddr, &rep.th);
862 	}
863 #endif
864 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
865 	if (rep.opt[0] == 0) {
866 		__be32 mrst = mptcp_reset_option(skb);
867 
868 		if (mrst) {
869 			rep.opt[0] = mrst;
870 			arg.iov[0].iov_len += sizeof(mrst);
871 			rep.th.doff = arg.iov[0].iov_len / 4;
872 		}
873 	}
874 
875 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 				      ip_hdr(skb)->saddr, /* XXX */
877 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
880 
881 	/* When the socket is gone, all binding information is lost and
882 	 * routing might fail in this case. No choice here: if we choose to force
883 	 * the input interface, we will misroute in the case of an asymmetric route.
884 	 */
885 	if (sk)
886 		arg.bound_dev_if = sk->sk_bound_dev_if;
887 
888 	trace_tcp_send_reset(sk, skb, reason);
889 
890 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
891 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
892 
893 	/* ECN bits of TW reset are cleared */
894 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
895 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
896 	local_bh_disable();
897 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
898 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
899 
900 	sock_net_set(ctl_sk, net);
901 	if (sk) {
902 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
903 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
904 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
905 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
906 		transmit_time = tcp_transmit_time(sk);
907 		xfrm_sk_clone_policy(ctl_sk, sk);
908 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
909 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
910 	} else {
911 		ctl_sk->sk_mark = 0;
912 		ctl_sk->sk_priority = 0;
913 	}
914 	ip_send_unicast_reply(ctl_sk, sk,
915 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 			      &arg, arg.iov[0].iov_len,
918 			      transmit_time, txhash);
919 
920 	xfrm_sk_free_policy(ctl_sk);
921 	sock_net_set(ctl_sk, &init_net);
922 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
923 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
924 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
925 	local_bh_enable();
926 
927 #ifdef CONFIG_TCP_MD5SIG
928 out:
929 	rcu_read_unlock();
930 #endif
931 }
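
/*
 * A minimal sketch (names are ours) of how the RST above picks its
 * ack_seq when the offending segment carried no ACK: it acknowledges
 * everything that segment consumed, i.e. its payload plus one for SYN
 * and one for FIN.
 */
#if 0	/* illustrative only, never compiled */
#include <stdint.h>

static uint32_t rst_ack_seq(uint32_t seq, uint32_t payload_len,
			    int syn, int fin)
{
	return seq + payload_len + (syn ? 1 : 0) + (fin ? 1 : 0);
}
#endif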
932 
933 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
934    outside socket context, is certainly ugly. What can I do?
935  */
936 
937 static void tcp_v4_send_ack(const struct sock *sk,
938 			    struct sk_buff *skb, u32 seq, u32 ack,
939 			    u32 win, u32 tsval, u32 tsecr, int oif,
940 			    struct tcp_key *key,
941 			    int reply_flags, u8 tos, u32 txhash)
942 {
943 	const struct tcphdr *th = tcp_hdr(skb);
944 	struct {
945 		struct tcphdr th;
946 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
947 	} rep;
948 	struct net *net = sock_net(sk);
949 	struct ip_reply_arg arg;
950 	struct sock *ctl_sk;
951 	u64 transmit_time;
952 
953 	memset(&rep.th, 0, sizeof(struct tcphdr));
954 	memset(&arg, 0, sizeof(arg));
955 
956 	arg.iov[0].iov_base = (unsigned char *)&rep;
957 	arg.iov[0].iov_len  = sizeof(rep.th);
958 	if (tsecr) {
959 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
960 				   (TCPOPT_TIMESTAMP << 8) |
961 				   TCPOLEN_TIMESTAMP);
962 		rep.opt[1] = htonl(tsval);
963 		rep.opt[2] = htonl(tsecr);
964 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
965 	}
966 
967 	/* Swap the send and the receive. */
968 	rep.th.dest    = th->source;
969 	rep.th.source  = th->dest;
970 	rep.th.doff    = arg.iov[0].iov_len / 4;
971 	rep.th.seq     = htonl(seq);
972 	rep.th.ack_seq = htonl(ack);
973 	rep.th.ack     = 1;
974 	rep.th.window  = htons(win);
975 
976 #ifdef CONFIG_TCP_MD5SIG
977 	if (tcp_key_is_md5(key)) {
978 		int offset = (tsecr) ? 3 : 0;
979 
980 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
981 					  (TCPOPT_NOP << 16) |
982 					  (TCPOPT_MD5SIG << 8) |
983 					  TCPOLEN_MD5SIG);
984 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
985 		rep.th.doff = arg.iov[0].iov_len/4;
986 
987 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
988 				    key->md5_key, ip_hdr(skb)->saddr,
989 				    ip_hdr(skb)->daddr, &rep.th);
990 	}
991 #endif
992 #ifdef CONFIG_TCP_AO
993 	if (tcp_key_is_ao(key)) {
994 		int offset = (tsecr) ? 3 : 0;
995 
996 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
997 					  (tcp_ao_len(key->ao_key) << 16) |
998 					  (key->ao_key->sndid << 8) |
999 					  key->rcv_next);
1000 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1001 		rep.th.doff = arg.iov[0].iov_len / 4;
1002 
1003 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1004 				key->ao_key, key->traffic_key,
1005 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1006 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1007 				&rep.th, key->sne);
1008 	}
1009 #endif
1010 	arg.flags = reply_flags;
1011 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1012 				      ip_hdr(skb)->saddr, /* XXX */
1013 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1014 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1015 	if (oif)
1016 		arg.bound_dev_if = oif;
1017 	arg.tos = tos;
1018 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1019 	local_bh_disable();
1020 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1021 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1022 	sock_net_set(ctl_sk, net);
1023 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1024 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1025 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1026 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1027 	transmit_time = tcp_transmit_time(sk);
1028 	ip_send_unicast_reply(ctl_sk, sk,
1029 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1030 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1031 			      &arg, arg.iov[0].iov_len,
1032 			      transmit_time, txhash);
1033 
1034 	sock_net_set(ctl_sk, &init_net);
1035 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1036 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1037 	local_bh_enable();
1038 }
1039 
1040 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1041 				enum tcp_tw_status tw_status)
1042 {
1043 	struct inet_timewait_sock *tw = inet_twsk(sk);
1044 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1045 	struct tcp_key key = {};
1046 	u8 tos = tw->tw_tos;
1047 
1048 	/* Clean only the ECN bits of TW ACKs for out-of-window data or paws_reject,
1049 	 * while not cleaning the ECN bits of other TW ACKs, to avoid those ACKs
1050 	 * being placed in a different service queue (Classic rather than L4S).
1051 	 */
1052 	if (tw_status == TCP_TW_ACK_OOW)
1053 		tos &= ~INET_ECN_MASK;
1054 
1055 #ifdef CONFIG_TCP_AO
1056 	struct tcp_ao_info *ao_info;
1057 
1058 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1059 		/* FIXME: the segment to-be-acked is not verified yet */
1060 		ao_info = rcu_dereference(tcptw->ao_info);
1061 		if (ao_info) {
1062 			const struct tcp_ao_hdr *aoh;
1063 
1064 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1065 				inet_twsk_put(tw);
1066 				return;
1067 			}
1068 
1069 			if (aoh)
1070 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1071 								    aoh->rnext_keyid, -1);
1072 		}
1073 	}
1074 	if (key.ao_key) {
1075 		struct tcp_ao_key *rnext_key;
1076 
1077 		key.traffic_key = snd_other_key(key.ao_key);
1078 		key.sne = READ_ONCE(ao_info->snd_sne);
1079 		rnext_key = READ_ONCE(ao_info->rnext_key);
1080 		key.rcv_next = rnext_key->rcvid;
1081 		key.type = TCP_KEY_AO;
1082 #else
1083 	if (0) {
1084 #endif
1085 	} else if (static_branch_tcp_md5()) {
1086 		key.md5_key = tcp_twsk_md5_key(tcptw);
1087 		if (key.md5_key)
1088 			key.type = TCP_KEY_MD5;
1089 	}
1090 
1091 	tcp_v4_send_ack(sk, skb,
1092 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1093 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1094 			tcp_tw_tsval(tcptw),
1095 			READ_ONCE(tcptw->tw_ts_recent),
1096 			tw->tw_bound_dev_if, &key,
1097 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1098 			tos,
1099 			tw->tw_txhash);
1100 
1101 	inet_twsk_put(tw);
1102 }
1103 
1104 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1105 				  struct request_sock *req)
1106 {
1107 	struct tcp_key key = {};
1108 
1109 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1110 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1111 	 */
1112 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1113 					     tcp_sk(sk)->snd_nxt;
1114 
1115 #ifdef CONFIG_TCP_AO
1116 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1117 	    tcp_rsk_used_ao(req)) {
1118 		const union tcp_md5_addr *addr;
1119 		const struct tcp_ao_hdr *aoh;
1120 		int l3index;
1121 
1122 		/* Invalid TCP option size or twice included auth */
1123 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1124 			return;
1125 		if (!aoh)
1126 			return;
1127 
1128 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1129 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1130 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1131 					      aoh->rnext_keyid, -1);
1132 		if (unlikely(!key.ao_key)) {
1133 			/* Send ACK with any matching MKT for the peer */
1134 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1135 			/* The matching key disappeared (user removed the key?);
1136 			 * let the handshake time out.
1137 			 */
1138 			if (!key.ao_key) {
1139 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1140 						     addr,
1141 						     ntohs(tcp_hdr(skb)->source),
1142 						     &ip_hdr(skb)->daddr,
1143 						     ntohs(tcp_hdr(skb)->dest));
1144 				return;
1145 			}
1146 		}
1147 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1148 		if (!key.traffic_key)
1149 			return;
1150 
1151 		key.type = TCP_KEY_AO;
1152 		key.rcv_next = aoh->keyid;
1153 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1154 #else
1155 	if (0) {
1156 #endif
1157 	} else if (static_branch_tcp_md5()) {
1158 		const union tcp_md5_addr *addr;
1159 		int l3index;
1160 
1161 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1162 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1163 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1164 		if (key.md5_key)
1165 			key.type = TCP_KEY_MD5;
1166 	}
1167 
1168 	/* Clean the ECN bits, as for TW ACKs of oow data or paws_reject */
1169 	tcp_v4_send_ack(sk, skb, seq,
1170 			tcp_rsk(req)->rcv_nxt,
1171 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1172 			tcp_rsk_tsval(tcp_rsk(req)),
1173 			req->ts_recent,
1174 			0, &key,
1175 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1176 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1177 			READ_ONCE(tcp_rsk(req)->txhash));
1178 	if (tcp_key_is_ao(&key))
1179 		kfree(key.traffic_key);
1180 }
1181 
1182 /*
1183  *	Send a SYN-ACK after having received a SYN.
1184  *	This still operates on a request_sock only, not on a big
1185  *	socket.
1186  */
1187 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1188 			      struct flowi *fl,
1189 			      struct request_sock *req,
1190 			      struct tcp_fastopen_cookie *foc,
1191 			      enum tcp_synack_type synack_type,
1192 			      struct sk_buff *syn_skb)
1193 {
1194 	struct inet_request_sock *ireq = inet_rsk(req);
1195 	struct flowi4 fl4;
1196 	int err = -1;
1197 	struct sk_buff *skb;
1198 	u8 tos;
1199 
1200 	/* First, grab a route. */
1201 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1202 		return -1;
1203 
1204 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1205 
1206 	if (skb) {
1207 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1208 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1209 
1210 		tos = READ_ONCE(inet_sk(sk)->tos);
1211 
1212 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1213 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1214 			      (tos & INET_ECN_MASK);
1215 
1216 		if (!INET_ECN_is_capable(tos) &&
1217 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1218 			tos |= INET_ECN_ECT_0;
1219 
1220 		rcu_read_lock();
1221 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1222 					    ireq->ir_rmt_addr,
1223 					    rcu_dereference(ireq->ireq_opt),
1224 					    tos);
1225 		rcu_read_unlock();
1226 		err = net_xmit_eval(err);
1227 	}
1228 
1229 	return err;
1230 }
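
/*
 * A minimal sketch (names are ours) of the TOS merge performed above
 * when tcp_reflect_tos is enabled: the SYN-ACK reuses the DSCP bits of
 * the incoming SYN while keeping the listener's own ECN bits.
 */
#if 0	/* illustrative only, never compiled */
#include <stdint.h>

static uint8_t reflect_tos(uint8_t syn_tos, uint8_t listener_tos)
{
	const uint8_t ecn_mask = 0x3;	/* low two bits carry ECN */

	return (uint8_t)((syn_tos & ~ecn_mask) | (listener_tos & ecn_mask));
}
#endif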
1231 
1232 /*
1233  *	IPv4 request_sock destructor.
1234  */
1235 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1236 {
1237 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1238 }
1239 
1240 #ifdef CONFIG_TCP_MD5SIG
1241 /*
1242  * RFC2385 MD5 checksumming requires a mapping of
1243  * IP address->MD5 Key.
1244  * We need to maintain these in the sk structure.
1245  */
1246 
1247 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1248 EXPORT_IPV6_MOD(tcp_md5_needed);
1249 
1250 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1251 {
1252 	if (!old)
1253 		return true;
1254 
1255 	/* l3index always overrides non-l3index */
1256 	if (old->l3index && new->l3index == 0)
1257 		return false;
1258 	if (old->l3index == 0 && new->l3index)
1259 		return true;
1260 
1261 	return old->prefixlen < new->prefixlen;
1262 }
1263 
1264 /* Find the Key structure for an address.  */
1265 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1266 					   const union tcp_md5_addr *addr,
1267 					   int family, bool any_l3index)
1268 {
1269 	const struct tcp_sock *tp = tcp_sk(sk);
1270 	struct tcp_md5sig_key *key;
1271 	const struct tcp_md5sig_info *md5sig;
1272 	__be32 mask;
1273 	struct tcp_md5sig_key *best_match = NULL;
1274 	bool match;
1275 
1276 	/* caller either holds rcu_read_lock() or socket lock */
1277 	md5sig = rcu_dereference_check(tp->md5sig_info,
1278 				       lockdep_sock_is_held(sk));
1279 	if (!md5sig)
1280 		return NULL;
1281 
1282 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1283 				 lockdep_sock_is_held(sk)) {
1284 		if (key->family != family)
1285 			continue;
1286 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1287 		    key->l3index != l3index)
1288 			continue;
1289 		if (family == AF_INET) {
1290 			mask = inet_make_mask(key->prefixlen);
1291 			match = (key->addr.a4.s_addr & mask) ==
1292 				(addr->a4.s_addr & mask);
1293 #if IS_ENABLED(CONFIG_IPV6)
1294 		} else if (family == AF_INET6) {
1295 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1296 						  key->prefixlen);
1297 #endif
1298 		} else {
1299 			match = false;
1300 		}
1301 
1302 		if (match && better_md5_match(best_match, key))
1303 			best_match = key;
1304 	}
1305 	return best_match;
1306 }
1307 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
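
/*
 * A minimal user-space sketch (not part of this file; names are ours)
 * of the IPv4 prefix match used by the lookup above: build a netmask
 * from the prefix length, as inet_make_mask() does, and compare the
 * masked addresses (both in network byte order).
 */
#if 0	/* illustrative only, never compiled */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>

static bool v4_prefix_match(uint32_t addr, uint32_t key, uint8_t plen)
{
	uint32_t mask = plen ? htonl(~0u << (32 - plen)) : 0;

	return (addr & mask) == (key & mask);
}
#endif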
1308 
1309 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1310 						      const union tcp_md5_addr *addr,
1311 						      int family, u8 prefixlen,
1312 						      int l3index, u8 flags)
1313 {
1314 	const struct tcp_sock *tp = tcp_sk(sk);
1315 	struct tcp_md5sig_key *key;
1316 	unsigned int size = sizeof(struct in_addr);
1317 	const struct tcp_md5sig_info *md5sig;
1318 
1319 	/* caller either holds rcu_read_lock() or socket lock */
1320 	md5sig = rcu_dereference_check(tp->md5sig_info,
1321 				       lockdep_sock_is_held(sk));
1322 	if (!md5sig)
1323 		return NULL;
1324 #if IS_ENABLED(CONFIG_IPV6)
1325 	if (family == AF_INET6)
1326 		size = sizeof(struct in6_addr);
1327 #endif
1328 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1329 				 lockdep_sock_is_held(sk)) {
1330 		if (key->family != family)
1331 			continue;
1332 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1333 			continue;
1334 		if (key->l3index != l3index)
1335 			continue;
1336 		if (!memcmp(&key->addr, addr, size) &&
1337 		    key->prefixlen == prefixlen)
1338 			return key;
1339 	}
1340 	return NULL;
1341 }
1342 
1343 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1344 					 const struct sock *addr_sk)
1345 {
1346 	const union tcp_md5_addr *addr;
1347 	int l3index;
1348 
1349 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1350 						 addr_sk->sk_bound_dev_if);
1351 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1352 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1353 }
1354 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1355 
1356 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1357 {
1358 	struct tcp_sock *tp = tcp_sk(sk);
1359 	struct tcp_md5sig_info *md5sig;
1360 
1361 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1362 	if (!md5sig)
1363 		return -ENOMEM;
1364 
1365 	sk_gso_disable(sk);
1366 	INIT_HLIST_HEAD(&md5sig->head);
1367 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1368 	return 0;
1369 }
1370 
1371 /* This can be called on a newly created socket, from other files */
1372 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1373 			    int family, u8 prefixlen, int l3index, u8 flags,
1374 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1375 {
1376 	/* Add Key to the list */
1377 	struct tcp_md5sig_key *key;
1378 	struct tcp_sock *tp = tcp_sk(sk);
1379 	struct tcp_md5sig_info *md5sig;
1380 
1381 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1382 	if (key) {
1383 		/* Pre-existing entry - just update that one.
1384 		 * Note that the key might be used concurrently.
1385 		 * data_race() tells KCSAN that we do not care about
1386 		 * key mismatches, since changing the MD5 key on live flows
1387 		 * can lead to packet drops.
1388 		 */
1389 		data_race(memcpy(key->key, newkey, newkeylen));
1390 
1391 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1392 		 * Also note that a reader could catch the new key->keylen value
1393 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1394 		 * at sock_kmalloc() time below these lines.
1395 		 */
1396 		WRITE_ONCE(key->keylen, newkeylen);
1397 
1398 		return 0;
1399 	}
1400 
1401 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1402 					   lockdep_sock_is_held(sk));
1403 
1404 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1405 	if (!key)
1406 		return -ENOMEM;
1407 
1408 	memcpy(key->key, newkey, newkeylen);
1409 	key->keylen = newkeylen;
1410 	key->family = family;
1411 	key->prefixlen = prefixlen;
1412 	key->l3index = l3index;
1413 	key->flags = flags;
1414 	memcpy(&key->addr, addr,
1415 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1416 								 sizeof(struct in_addr));
1417 	hlist_add_head_rcu(&key->node, &md5sig->head);
1418 	return 0;
1419 }
1420 
1421 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1422 		   int family, u8 prefixlen, int l3index, u8 flags,
1423 		   const u8 *newkey, u8 newkeylen)
1424 {
1425 	struct tcp_sock *tp = tcp_sk(sk);
1426 
1427 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1428 		if (tcp_md5_alloc_sigpool())
1429 			return -ENOMEM;
1430 
1431 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1432 			tcp_md5_release_sigpool();
1433 			return -ENOMEM;
1434 		}
1435 
1436 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1437 			struct tcp_md5sig_info *md5sig;
1438 
1439 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1440 			rcu_assign_pointer(tp->md5sig_info, NULL);
1441 			kfree_rcu(md5sig, rcu);
1442 			tcp_md5_release_sigpool();
1443 			return -EUSERS;
1444 		}
1445 	}
1446 
1447 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1448 				newkey, newkeylen, GFP_KERNEL);
1449 }
1450 EXPORT_IPV6_MOD(tcp_md5_do_add);
1451 
1452 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1453 		     int family, u8 prefixlen, int l3index,
1454 		     struct tcp_md5sig_key *key)
1455 {
1456 	struct tcp_sock *tp = tcp_sk(sk);
1457 
1458 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1459 		tcp_md5_add_sigpool();
1460 
1461 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1462 			tcp_md5_release_sigpool();
1463 			return -ENOMEM;
1464 		}
1465 
1466 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1467 			struct tcp_md5sig_info *md5sig;
1468 
1469 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1470 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1471 			rcu_assign_pointer(tp->md5sig_info, NULL);
1472 			kfree_rcu(md5sig, rcu);
1473 			tcp_md5_release_sigpool();
1474 			return -EUSERS;
1475 		}
1476 	}
1477 
1478 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1479 				key->flags, key->key, key->keylen,
1480 				sk_gfp_mask(sk, GFP_ATOMIC));
1481 }
1482 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1483 
1484 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1485 		   u8 prefixlen, int l3index, u8 flags)
1486 {
1487 	struct tcp_md5sig_key *key;
1488 
1489 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1490 	if (!key)
1491 		return -ENOENT;
1492 	hlist_del_rcu(&key->node);
1493 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1494 	kfree_rcu(key, rcu);
1495 	return 0;
1496 }
1497 EXPORT_IPV6_MOD(tcp_md5_do_del);
1498 
1499 void tcp_clear_md5_list(struct sock *sk)
1500 {
1501 	struct tcp_sock *tp = tcp_sk(sk);
1502 	struct tcp_md5sig_key *key;
1503 	struct hlist_node *n;
1504 	struct tcp_md5sig_info *md5sig;
1505 
1506 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1507 
1508 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1509 		hlist_del(&key->node);
1510 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1511 		kfree(key);
1512 	}
1513 }
1514 
1515 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1516 				 sockptr_t optval, int optlen)
1517 {
1518 	struct tcp_md5sig cmd;
1519 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1520 	const union tcp_md5_addr *addr;
1521 	u8 prefixlen = 32;
1522 	int l3index = 0;
1523 	bool l3flag;
1524 	u8 flags;
1525 
1526 	if (optlen < sizeof(cmd))
1527 		return -EINVAL;
1528 
1529 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1530 		return -EFAULT;
1531 
1532 	if (sin->sin_family != AF_INET)
1533 		return -EINVAL;
1534 
1535 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1536 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1537 
1538 	if (optname == TCP_MD5SIG_EXT &&
1539 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1540 		prefixlen = cmd.tcpm_prefixlen;
1541 		if (prefixlen > 32)
1542 			return -EINVAL;
1543 	}
1544 
1545 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1546 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1547 		struct net_device *dev;
1548 
1549 		rcu_read_lock();
1550 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1551 		if (dev && netif_is_l3_master(dev))
1552 			l3index = dev->ifindex;
1553 
1554 		rcu_read_unlock();
1555 
1556 		/* OK to reference dev being set/not set outside of RCU;
1557 		 * right now the device MUST be an L3 master.
1558 		 */
1559 		if (!dev || !l3index)
1560 			return -EINVAL;
1561 	}
1562 
1563 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1564 
1565 	if (!cmd.tcpm_keylen)
1566 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1567 
1568 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1569 		return -EINVAL;
1570 
1571 	/* Don't allow keys for peers that have a matching TCP-AO key.
1572 	 * See the comment in tcp_ao_add_cmd()
1573 	 */
1574 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1575 		return -EKEYREJECTED;
1576 
1577 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1578 			      cmd.tcpm_key, cmd.tcpm_keylen);
1579 }
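
/*
 * A minimal user-space sketch (not part of this file) of installing an
 * RFC 2385 key, which is what arrives in tcp_v4_parse_md5_keys() above;
 * TCP_MD5SIG_EXT with tcpm_flags/tcpm_prefixlen/tcpm_ifindex covers the
 * prefix and VRF variants handled there.  Peer and key are examples.
 */
#if 0	/* illustrative only, never compiled */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

static int add_md5_key_example(int fd, const struct sockaddr_in *peer,
			       const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
#endif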
1580 
1581 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1582 				   __be32 daddr, __be32 saddr,
1583 				   const struct tcphdr *th, int nbytes)
1584 {
1585 	struct tcp4_pseudohdr *bp;
1586 	struct scatterlist sg;
1587 	struct tcphdr *_th;
1588 
1589 	bp = hp->scratch;
1590 	bp->saddr = saddr;
1591 	bp->daddr = daddr;
1592 	bp->pad = 0;
1593 	bp->protocol = IPPROTO_TCP;
1594 	bp->len = cpu_to_be16(nbytes);
1595 
1596 	_th = (struct tcphdr *)(bp + 1);
1597 	memcpy(_th, th, sizeof(*th));
1598 	_th->check = 0;
1599 
1600 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1601 	ahash_request_set_crypt(hp->req, &sg, NULL,
1602 				sizeof(*bp) + sizeof(*th));
1603 	return crypto_ahash_update(hp->req);
1604 }
1605 
1606 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1607 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1608 {
1609 	struct tcp_sigpool hp;
1610 
1611 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1612 		goto clear_hash_nostart;
1613 
1614 	if (crypto_ahash_init(hp.req))
1615 		goto clear_hash;
1616 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1617 		goto clear_hash;
1618 	if (tcp_md5_hash_key(&hp, key))
1619 		goto clear_hash;
1620 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1621 	if (crypto_ahash_final(hp.req))
1622 		goto clear_hash;
1623 
1624 	tcp_sigpool_end(&hp);
1625 	return 0;
1626 
1627 clear_hash:
1628 	tcp_sigpool_end(&hp);
1629 clear_hash_nostart:
1630 	memset(md5_hash, 0, 16);
1631 	return 1;
1632 }
1633 
1634 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1635 			const struct sock *sk,
1636 			const struct sk_buff *skb)
1637 {
1638 	const struct tcphdr *th = tcp_hdr(skb);
1639 	struct tcp_sigpool hp;
1640 	__be32 saddr, daddr;
1641 
1642 	if (sk) { /* valid for establish/request sockets */
1643 		saddr = sk->sk_rcv_saddr;
1644 		daddr = sk->sk_daddr;
1645 	} else {
1646 		const struct iphdr *iph = ip_hdr(skb);
1647 		saddr = iph->saddr;
1648 		daddr = iph->daddr;
1649 	}
1650 
1651 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1652 		goto clear_hash_nostart;
1653 
1654 	if (crypto_ahash_init(hp.req))
1655 		goto clear_hash;
1656 
1657 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1658 		goto clear_hash;
1659 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1660 		goto clear_hash;
1661 	if (tcp_md5_hash_key(&hp, key))
1662 		goto clear_hash;
1663 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1664 	if (crypto_ahash_final(hp.req))
1665 		goto clear_hash;
1666 
1667 	tcp_sigpool_end(&hp);
1668 	return 0;
1669 
1670 clear_hash:
1671 	tcp_sigpool_end(&hp);
1672 clear_hash_nostart:
1673 	memset(md5_hash, 0, 16);
1674 	return 1;
1675 }
1676 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1677 
1678 #endif
1679 
1680 static void tcp_v4_init_req(struct request_sock *req,
1681 			    const struct sock *sk_listener,
1682 			    struct sk_buff *skb)
1683 {
1684 	struct inet_request_sock *ireq = inet_rsk(req);
1685 	struct net *net = sock_net(sk_listener);
1686 
1687 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1688 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1689 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1690 }
1691 
1692 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1693 					  struct sk_buff *skb,
1694 					  struct flowi *fl,
1695 					  struct request_sock *req,
1696 					  u32 tw_isn)
1697 {
1698 	tcp_v4_init_req(req, sk, skb);
1699 
1700 	if (security_inet_conn_request(sk, skb, req))
1701 		return NULL;
1702 
1703 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1704 }
1705 
1706 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1707 	.family		=	PF_INET,
1708 	.obj_size	=	sizeof(struct tcp_request_sock),
1709 	.send_ack	=	tcp_v4_reqsk_send_ack,
1710 	.destructor	=	tcp_v4_reqsk_destructor,
1711 	.send_reset	=	tcp_v4_send_reset,
1712 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1713 };
1714 
1715 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1716 	.mss_clamp	=	TCP_MSS_DEFAULT,
1717 #ifdef CONFIG_TCP_MD5SIG
1718 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1719 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1720 #endif
1721 #ifdef CONFIG_TCP_AO
1722 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1723 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1724 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1725 #endif
1726 #ifdef CONFIG_SYN_COOKIES
1727 	.cookie_init_seq =	cookie_v4_init_sequence,
1728 #endif
1729 	.route_req	=	tcp_v4_route_req,
1730 	.init_seq	=	tcp_v4_init_seq,
1731 	.init_ts_off	=	tcp_v4_init_ts_off,
1732 	.send_synack	=	tcp_v4_send_synack,
1733 };
1734 
1735 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1736 {
1737 	/* Never answer SYNs sent to broadcast or multicast */
1738 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1739 		goto drop;
1740 
1741 	return tcp_conn_request(&tcp_request_sock_ops,
1742 				&tcp_request_sock_ipv4_ops, sk, skb);
1743 
1744 drop:
1745 	tcp_listendrop(sk);
1746 	return 0;
1747 }
1748 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1749 
1750 
1751 /*
1752  * The three way handshake has completed - we got a valid synack -
1753  * now create the new socket.
1754  */
1755 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1756 				  struct request_sock *req,
1757 				  struct dst_entry *dst,
1758 				  struct request_sock *req_unhash,
1759 				  bool *own_req)
1760 {
1761 	struct inet_request_sock *ireq;
1762 	bool found_dup_sk = false;
1763 	struct inet_sock *newinet;
1764 	struct tcp_sock *newtp;
1765 	struct sock *newsk;
1766 #ifdef CONFIG_TCP_MD5SIG
1767 	const union tcp_md5_addr *addr;
1768 	struct tcp_md5sig_key *key;
1769 	int l3index;
1770 #endif
1771 	struct ip_options_rcu *inet_opt;
1772 
1773 	if (sk_acceptq_is_full(sk))
1774 		goto exit_overflow;
1775 
1776 	newsk = tcp_create_openreq_child(sk, req, skb);
1777 	if (!newsk)
1778 		goto exit_nonewsk;
1779 
1780 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1781 	inet_sk_rx_dst_set(newsk, skb);
1782 
1783 	newtp		      = tcp_sk(newsk);
1784 	newinet		      = inet_sk(newsk);
1785 	ireq		      = inet_rsk(req);
1786 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1787 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1788 	newinet->mc_index     = inet_iif(skb);
1789 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1790 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1791 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1792 	if (inet_opt)
1793 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1794 	atomic_set(&newinet->inet_id, get_random_u16());
1795 
1796 	/* Set ToS of the new socket based upon the value of the incoming SYN.
1797 	 * ECT bits are set later in tcp_init_transfer().
1798 	 */
1799 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1800 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
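	/* Illustrative example (hypothetical values): a SYN whose syn_tos was
	 * 0xBA (DSCP EF with ECT(0) set) would leave newinet->tos at 0xB8 once
	 * the ECN bits (INET_ECN_MASK) are cleared above.
	 */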
1801 
1802 	if (!dst) {
1803 		dst = inet_csk_route_child_sock(sk, newsk, req);
1804 		if (!dst)
1805 			goto put_and_exit;
1806 	} else {
1807 		/* syncookie case: see end of cookie_v4_check() */
1808 	}
1809 	sk_setup_caps(newsk, dst);
1810 
1811 	tcp_ca_openreq_child(newsk, dst);
1812 
1813 	tcp_sync_mss(newsk, dst_mtu(dst));
1814 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1815 
1816 	tcp_initialize_rcv_mss(newsk);
1817 
1818 #ifdef CONFIG_TCP_MD5SIG
1819 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1820 	/* Copy over the MD5 key from the original socket */
1821 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1822 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1823 	if (key && !tcp_rsk_used_ao(req)) {
1824 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1825 			goto put_and_exit;
1826 		sk_gso_disable(newsk);
1827 	}
1828 #endif
1829 #ifdef CONFIG_TCP_AO
1830 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1831 		goto put_and_exit; /* OOM, release back memory */
1832 #endif
1833 
1834 	if (__inet_inherit_port(sk, newsk) < 0)
1835 		goto put_and_exit;
1836 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1837 				       &found_dup_sk);
1838 	if (likely(*own_req)) {
1839 		tcp_move_syn(newtp, req);
1840 		ireq->ireq_opt = NULL;
1841 	} else {
1842 		newinet->inet_opt = NULL;
1843 
1844 		if (!req_unhash && found_dup_sk) {
1845 			/* This code path should only be executed in the
1846 			 * syncookie case
1847 			 */
1848 			bh_unlock_sock(newsk);
1849 			sock_put(newsk);
1850 			newsk = NULL;
1851 		}
1852 	}
1853 	return newsk;
1854 
1855 exit_overflow:
1856 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1857 exit_nonewsk:
1858 	dst_release(dst);
1859 exit:
1860 	tcp_listendrop(sk);
1861 	return NULL;
1862 put_and_exit:
1863 	newinet->inet_opt = NULL;
1864 	inet_csk_prepare_forced_close(newsk);
1865 	tcp_done(newsk);
1866 	goto exit;
1867 }
1868 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1869 
1870 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1871 {
1872 #ifdef CONFIG_SYN_COOKIES
1873 	const struct tcphdr *th = tcp_hdr(skb);
1874 
1875 	if (!th->syn)
1876 		sk = cookie_v4_check(sk, skb);
1877 #endif
1878 	return sk;
1879 }
1880 
1881 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1882 			 struct tcphdr *th, u32 *cookie)
1883 {
1884 	u16 mss = 0;
1885 #ifdef CONFIG_SYN_COOKIES
1886 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1887 				    &tcp_request_sock_ipv4_ops, sk, th);
1888 	if (mss) {
1889 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1890 		tcp_synq_overflow(sk);
1891 	}
1892 #endif
1893 	return mss;
1894 }
1895 
1896 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1897 							   u32));
1898 /* The socket must have its spinlock held when we get
1899  * here, unless it is a TCP_LISTEN socket.
1900  *
1901  * We have a potential double-lock case here, so even when
1902  * doing backlog processing we use the BH locking scheme.
1903  * This is because we cannot sleep with the original spinlock
1904  * held.
1905  */
1906 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1907 {
1908 	enum skb_drop_reason reason;
1909 	struct sock *rsk;
1910 
1911 	reason = psp_sk_rx_policy_check(sk, skb);
1912 	if (reason)
1913 		goto err_discard;
1914 
1915 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1916 		struct dst_entry *dst;
1917 
1918 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1919 						lockdep_sock_is_held(sk));
1920 
1921 		sock_rps_save_rxhash(sk, skb);
1922 		sk_mark_napi_id(sk, skb);
1923 		if (dst) {
1924 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1925 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1926 					     dst, 0)) {
1927 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1928 				dst_release(dst);
1929 			}
1930 		}
1931 		tcp_rcv_established(sk, skb);
1932 		return 0;
1933 	}
1934 
1935 	if (tcp_checksum_complete(skb))
1936 		goto csum_err;
1937 
1938 	if (sk->sk_state == TCP_LISTEN) {
1939 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1940 
1941 		if (!nsk)
1942 			return 0;
1943 		if (nsk != sk) {
1944 			reason = tcp_child_process(sk, nsk, skb);
1945 			if (reason) {
1946 				rsk = nsk;
1947 				goto reset;
1948 			}
1949 			return 0;
1950 		}
1951 	} else
1952 		sock_rps_save_rxhash(sk, skb);
1953 
1954 	reason = tcp_rcv_state_process(sk, skb);
1955 	if (reason) {
1956 		rsk = sk;
1957 		goto reset;
1958 	}
1959 	return 0;
1960 
1961 reset:
1962 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1963 discard:
1964 	sk_skb_reason_drop(sk, skb, reason);
1965 	/* Be careful here. If this function gets more complicated and
1966 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1967 	 * might be destroyed here. This current version compiles correctly,
1968 	 * but you have been warned.
1969 	 */
1970 	return 0;
1971 
1972 csum_err:
1973 	reason = SKB_DROP_REASON_TCP_CSUM;
1974 	trace_tcp_bad_csum(skb);
1975 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1976 err_discard:
1977 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1978 	goto discard;
1979 }
1980 EXPORT_SYMBOL(tcp_v4_do_rcv);
1981 
1982 int tcp_v4_early_demux(struct sk_buff *skb)
1983 {
1984 	struct net *net = dev_net_rcu(skb->dev);
1985 	const struct iphdr *iph;
1986 	const struct tcphdr *th;
1987 	struct sock *sk;
1988 
1989 	if (skb->pkt_type != PACKET_HOST)
1990 		return 0;
1991 
1992 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1993 		return 0;
1994 
1995 	iph = ip_hdr(skb);
1996 	th = tcp_hdr(skb);
1997 
1998 	if (th->doff < sizeof(struct tcphdr) / 4)
1999 		return 0;
2000 
2001 	sk = __inet_lookup_established(net, iph->saddr, th->source,
2002 				       iph->daddr, ntohs(th->dest),
2003 				       skb->skb_iif, inet_sdif(skb));
2004 	if (sk) {
2005 		skb->sk = sk;
2006 		skb->destructor = sock_edemux;
2007 		if (sk_fullsock(sk)) {
2008 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2009 
2010 			if (dst)
2011 				dst = dst_check(dst, 0);
2012 			if (dst &&
2013 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2014 				skb_dst_set_noref(skb, dst);
2015 		}
2016 	}
2017 	return 0;
2018 }
2019 
2020 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2021 		     enum skb_drop_reason *reason)
2022 {
2023 	u32 tail_gso_size, tail_gso_segs;
2024 	struct skb_shared_info *shinfo;
2025 	const struct tcphdr *th;
2026 	struct tcphdr *thtail;
2027 	struct sk_buff *tail;
2028 	unsigned int hdrlen;
2029 	bool fragstolen;
2030 	u32 gso_segs;
2031 	u32 gso_size;
2032 	u64 limit;
2033 	int delta;
2034 	int err;
2035 
2036 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2037 	 * we can fix skb->truesize to its real value to avoid future drops.
2038 	 * This is valid because skb is not yet charged to the socket.
2039 	 * It has been noticed that pure SACK packets were sometimes dropped
2040 	 * (if cooked by drivers without the copybreak feature).
2041 	 */
2042 	skb_condense(skb);
2043 
2044 	tcp_cleanup_skb(skb);
2045 
2046 	if (unlikely(tcp_checksum_complete(skb))) {
2047 		bh_unlock_sock(sk);
2048 		trace_tcp_bad_csum(skb);
2049 		*reason = SKB_DROP_REASON_TCP_CSUM;
2050 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2051 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2052 		return true;
2053 	}
2054 
2055 	/* Attempt coalescing to last skb in backlog, even if we are
2056 	 * above the limits.
2057 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2058 	 */
2059 	th = (const struct tcphdr *)skb->data;
2060 	hdrlen = th->doff * 4;
2061 
2062 	tail = sk->sk_backlog.tail;
2063 	if (!tail)
2064 		goto no_coalesce;
2065 	thtail = (struct tcphdr *)tail->data;
2066 
2067 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2068 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2069 	    ((TCP_SKB_CB(tail)->tcp_flags |
2070 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2071 	    !((TCP_SKB_CB(tail)->tcp_flags &
2072 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2073 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2074 	      TCP_SKB_CB(skb)->tcp_flags) &
2075 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2076 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2077 	    thtail->doff != th->doff ||
2078 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
2079 	    /* prior to PSP Rx policy check, retain exact PSP metadata */
2080 	    psp_skb_coalesce_diff(tail, skb))
2081 		goto no_coalesce;
2082 
2083 	__skb_pull(skb, hdrlen);
2084 
2085 	shinfo = skb_shinfo(skb);
2086 	gso_size = shinfo->gso_size ?: skb->len;
2087 	gso_segs = shinfo->gso_segs ?: 1;
2088 
2089 	shinfo = skb_shinfo(tail);
2090 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2091 	tail_gso_segs = shinfo->gso_segs ?: 1;
2092 
2093 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2094 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2095 
2096 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2097 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2098 			thtail->window = th->window;
2099 		}
2100 
2101 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2102 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2103 		 * is not entered if we append a packet with a FIN.
2104 		 * SYN, RST, URG are not present.
2105 		 * ACK is set on both packets.
2106 		 * PSH : we do not really care in TCP stack,
2107 		 *       at least for 'GRO' packets.
2108 		 */
2109 		thtail->fin |= th->fin;
2110 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2111 
2112 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2113 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2114 			tail->tstamp = skb->tstamp;
2115 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2116 		}
2117 
2118 		/* Not as strict as GRO. We only need to carry the max mss value */
2119 		shinfo->gso_size = max(gso_size, tail_gso_size);
2120 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
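		/* Illustrative example (hypothetical sizes): a tail holding three
		 * MSS-sized segments (gso_size 1448, gso_segs 3) coalesced with one
		 * more 1448-byte segment ends up with gso_size 1448 and gso_segs 4.
		 */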
2121 
2122 		sk->sk_backlog.len += delta;
2123 		__NET_INC_STATS(sock_net(sk),
2124 				LINUX_MIB_TCPBACKLOGCOALESCE);
2125 		kfree_skb_partial(skb, fragstolen);
2126 		return false;
2127 	}
2128 	__skb_push(skb, hdrlen);
2129 
2130 no_coalesce:
2131 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2132 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2133 	 * sk_rcvbuf in normal conditions.
2134 	 */
2135 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2136 
2137 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2138 
2139 	/* Only the socket owner can try to collapse/prune rx queues
2140 	 * to reduce memory overhead, so add a little headroom here.
2141 	 * Only a few socket backlogs are likely to be non-empty at the same time.
2142 	 */
2143 	limit += 64 * 1024;
2144 
2145 	limit = min_t(u64, limit, UINT_MAX);
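	/* Worked example (hypothetical buffer sizes): sk_rcvbuf = 131072 and
	 * sk_sndbuf = 16384 give limit = 2 * 131072 + 16384 / 2 + 65536
	 * = 335872 bytes before the UINT_MAX clamp.
	 */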
2146 
2147 	err = sk_add_backlog(sk, skb, limit);
2148 	if (unlikely(err)) {
2149 		bh_unlock_sock(sk);
2150 		if (err == -ENOMEM) {
2151 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2152 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2153 		} else {
2154 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2155 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2156 		}
2157 		return true;
2158 	}
2159 	return false;
2160 }
2161 EXPORT_IPV6_MOD(tcp_add_backlog);
2162 
2163 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2164 {
2165 	struct tcphdr *th = (struct tcphdr *)skb->data;
2166 
2167 	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
2168 }
2169 EXPORT_IPV6_MOD(tcp_filter);
2170 
2171 static void tcp_v4_restore_cb(struct sk_buff *skb)
2172 {
2173 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2174 		sizeof(struct inet_skb_parm));
2175 }
2176 
2177 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2178 			   const struct tcphdr *th)
2179 {
2180 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2181 	 * barrier() makes sure the compiler won't play aliasing games.
2182 	 */
2183 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2184 		sizeof(struct inet_skb_parm));
2185 	barrier();
2186 
2187 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2188 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2189 				    skb->len - th->doff * 4);
2190 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2191 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2192 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2193 	TCP_SKB_CB(skb)->sacked	 = 0;
2194 	TCP_SKB_CB(skb)->has_rxtstamp =
2195 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2196 }
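/* Illustrative example of the sequence accounting above (hypothetical
 * values): a pure data segment with seq 1000 and 500 payload bytes gets
 * end_seq = 1000 + 0 + 0 + 500 = 1500, while a bare SYN gets
 * end_seq = seq + 1.
 */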
2197 
2198 /*
2199  *	From tcp_input.c
2200  */
2201 
2202 int tcp_v4_rcv(struct sk_buff *skb)
2203 {
2204 	struct net *net = dev_net_rcu(skb->dev);
2205 	enum skb_drop_reason drop_reason;
2206 	enum tcp_tw_status tw_status;
2207 	int sdif = inet_sdif(skb);
2208 	int dif = inet_iif(skb);
2209 	const struct iphdr *iph;
2210 	const struct tcphdr *th;
2211 	struct sock *sk = NULL;
2212 	bool refcounted;
2213 	int ret;
2214 	u32 isn;
2215 
2216 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2217 	if (skb->pkt_type != PACKET_HOST)
2218 		goto discard_it;
2219 
2220 	/* Count it even if it's bad */
2221 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2222 
2223 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2224 		goto discard_it;
2225 
2226 	th = (const struct tcphdr *)skb->data;
2227 
2228 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2229 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2230 		goto bad_packet;
2231 	}
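	/* th->doff counts 32-bit words, so sizeof(struct tcphdr) / 4 == 5 is
	 * the smallest legal value, i.e. a 20-byte header with no options.
	 */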
2232 	if (!pskb_may_pull(skb, th->doff * 4))
2233 		goto discard_it;
2234 
2235 	/* An explanation is required here, I think.
2236 	 * Packet length and doff are validated by header prediction,
2237 	 * provided the case of th->doff == 0 has been eliminated.
2238 	 * So, we defer the checks. */
2239 
2240 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2241 		goto csum_error;
2242 
2243 	th = (const struct tcphdr *)skb->data;
2244 	iph = ip_hdr(skb);
2245 lookup:
2246 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2247 			       th->dest, sdif, &refcounted);
2248 	if (!sk)
2249 		goto no_tcp_socket;
2250 
2251 	if (sk->sk_state == TCP_TIME_WAIT)
2252 		goto do_time_wait;
2253 
2254 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2255 		struct request_sock *req = inet_reqsk(sk);
2256 		bool req_stolen = false;
2257 		struct sock *nsk;
2258 
2259 		sk = req->rsk_listener;
2260 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2261 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2262 		else
2263 			drop_reason = tcp_inbound_hash(sk, req, skb,
2264 						       &iph->saddr, &iph->daddr,
2265 						       AF_INET, dif, sdif);
2266 		if (unlikely(drop_reason)) {
2267 			sk_drops_skbadd(sk, skb);
2268 			reqsk_put(req);
2269 			goto discard_it;
2270 		}
2271 		if (tcp_checksum_complete(skb)) {
2272 			reqsk_put(req);
2273 			goto csum_error;
2274 		}
2275 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2276 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2277 			if (!nsk) {
2278 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2279 				goto lookup;
2280 			}
2281 			sk = nsk;
2282 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2283 			 * before returning.
2284 			 */
2285 		} else {
2286 			/* We own a reference on the listener, increase it again
2287 			 * as we might lose it too soon.
2288 			 */
2289 			sock_hold(sk);
2290 		}
2291 		refcounted = true;
2292 		nsk = NULL;
2293 		if (!tcp_filter(sk, skb, &drop_reason)) {
2294 			th = (const struct tcphdr *)skb->data;
2295 			iph = ip_hdr(skb);
2296 			tcp_v4_fill_cb(skb, iph, th);
2297 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2298 					    &drop_reason);
2299 		}
2300 		if (!nsk) {
2301 			reqsk_put(req);
2302 			if (req_stolen) {
2303 				/* Another cpu got exclusive access to req
2304 				 * and created a full-blown socket.
2305 				 * Try to feed this packet to this socket
2306 				 * instead of discarding it.
2307 				 */
2308 				tcp_v4_restore_cb(skb);
2309 				sock_put(sk);
2310 				goto lookup;
2311 			}
2312 			goto discard_and_relse;
2313 		}
2314 		nf_reset_ct(skb);
2315 		if (nsk == sk) {
2316 			reqsk_put(req);
2317 			tcp_v4_restore_cb(skb);
2318 		} else {
2319 			drop_reason = tcp_child_process(sk, nsk, skb);
2320 			if (drop_reason) {
2321 				enum sk_rst_reason rst_reason;
2322 
2323 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2324 				tcp_v4_send_reset(nsk, skb, rst_reason);
2325 				goto discard_and_relse;
2326 			}
2327 			sock_put(sk);
2328 			return 0;
2329 		}
2330 	}
2331 
2332 process:
2333 	if (static_branch_unlikely(&ip4_min_ttl)) {
2334 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2335 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2336 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2337 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2338 			goto discard_and_relse;
2339 		}
2340 	}
2341 
2342 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2343 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2344 		goto discard_and_relse;
2345 	}
2346 
2347 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2348 				       AF_INET, dif, sdif);
2349 	if (drop_reason)
2350 		goto discard_and_relse;
2351 
2352 	nf_reset_ct(skb);
2353 
2354 	if (tcp_filter(sk, skb, &drop_reason))
2355 		goto discard_and_relse;
2356 
2357 	th = (const struct tcphdr *)skb->data;
2358 	iph = ip_hdr(skb);
2359 	tcp_v4_fill_cb(skb, iph, th);
2360 
2361 	skb->dev = NULL;
2362 
2363 	if (sk->sk_state == TCP_LISTEN) {
2364 		ret = tcp_v4_do_rcv(sk, skb);
2365 		goto put_and_return;
2366 	}
2367 
2368 	sk_incoming_cpu_update(sk);
2369 
2370 	bh_lock_sock_nested(sk);
2371 	tcp_segs_in(tcp_sk(sk), skb);
2372 	ret = 0;
2373 	if (!sock_owned_by_user(sk)) {
2374 		ret = tcp_v4_do_rcv(sk, skb);
2375 	} else {
2376 		if (tcp_add_backlog(sk, skb, &drop_reason))
2377 			goto discard_and_relse;
2378 	}
2379 	bh_unlock_sock(sk);
2380 
2381 put_and_return:
2382 	if (refcounted)
2383 		sock_put(sk);
2384 
2385 	return ret;
2386 
2387 no_tcp_socket:
2388 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2389 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2390 		goto discard_it;
2391 
2392 	tcp_v4_fill_cb(skb, iph, th);
2393 
2394 	if (tcp_checksum_complete(skb)) {
2395 csum_error:
2396 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2397 		trace_tcp_bad_csum(skb);
2398 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2399 bad_packet:
2400 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2401 	} else {
2402 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2403 	}
2404 
2405 discard_it:
2406 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2407 	/* Discard frame. */
2408 	sk_skb_reason_drop(sk, skb, drop_reason);
2409 	return 0;
2410 
2411 discard_and_relse:
2412 	sk_drops_skbadd(sk, skb);
2413 	if (refcounted)
2414 		sock_put(sk);
2415 	goto discard_it;
2416 
2417 do_time_wait:
2418 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2419 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2420 		inet_twsk_put(inet_twsk(sk));
2421 		goto discard_it;
2422 	}
2423 
2424 	tcp_v4_fill_cb(skb, iph, th);
2425 
2426 	if (tcp_checksum_complete(skb)) {
2427 		inet_twsk_put(inet_twsk(sk));
2428 		goto csum_error;
2429 	}
2430 
2431 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2432 					       &drop_reason);
2433 	switch (tw_status) {
2434 	case TCP_TW_SYN: {
2435 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2436 							iph->saddr, th->source,
2437 							iph->daddr, th->dest,
2438 							inet_iif(skb),
2439 							sdif);
2440 		if (sk2) {
2441 			inet_twsk_deschedule_put(inet_twsk(sk));
2442 			sk = sk2;
2443 			tcp_v4_restore_cb(skb);
2444 			refcounted = false;
2445 			__this_cpu_write(tcp_tw_isn, isn);
2446 			goto process;
2447 		}
2448 
2449 		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
2450 		if (drop_reason)
2451 			break;
2452 	}
2453 		/* to ACK */
2454 		fallthrough;
2455 	case TCP_TW_ACK:
2456 	case TCP_TW_ACK_OOW:
2457 		tcp_v4_timewait_ack(sk, skb, tw_status);
2458 		break;
2459 	case TCP_TW_RST:
2460 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2461 		inet_twsk_deschedule_put(inet_twsk(sk));
2462 		goto discard_it;
2463 	case TCP_TW_SUCCESS:;
2464 	}
2465 	goto discard_it;
2466 }
2467 
2468 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2469 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2470 };
2471 
2472 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2473 {
2474 	struct dst_entry *dst = skb_dst(skb);
2475 
2476 	if (dst && dst_hold_safe(dst)) {
2477 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2478 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2479 	}
2480 }
2481 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2482 
2483 const struct inet_connection_sock_af_ops ipv4_specific = {
2484 	.queue_xmit	   = ip_queue_xmit,
2485 	.send_check	   = tcp_v4_send_check,
2486 	.rebuild_header	   = inet_sk_rebuild_header,
2487 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2488 	.conn_request	   = tcp_v4_conn_request,
2489 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2490 	.net_header_len	   = sizeof(struct iphdr),
2491 	.setsockopt	   = ip_setsockopt,
2492 	.getsockopt	   = ip_getsockopt,
2493 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2494 };
2495 EXPORT_IPV6_MOD(ipv4_specific);
2496 
2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2499 #ifdef CONFIG_TCP_MD5SIG
2500 	.md5_lookup		= tcp_v4_md5_lookup,
2501 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2502 	.md5_parse		= tcp_v4_parse_md5_keys,
2503 #endif
2504 #ifdef CONFIG_TCP_AO
2505 	.ao_lookup		= tcp_v4_ao_lookup,
2506 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2507 	.ao_parse		= tcp_v4_parse_ao,
2508 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2509 #endif
2510 };
2511 
2512 static void tcp4_destruct_sock(struct sock *sk)
2513 {
2514 	tcp_md5_destruct_sock(sk);
2515 	tcp_ao_destroy_sock(sk, false);
2516 	inet_sock_destruct(sk);
2517 }
2518 #endif
2519 
2520 /* NOTE: A lot of things are set to zero explicitly by the call to
2521  *       sk_alloc(), so they need not be done here.
2522  */
2523 static int tcp_v4_init_sock(struct sock *sk)
2524 {
2525 	struct inet_connection_sock *icsk = inet_csk(sk);
2526 
2527 	tcp_init_sock(sk);
2528 
2529 	icsk->icsk_af_ops = &ipv4_specific;
2530 
2531 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2532 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2533 	sk->sk_destruct = tcp4_destruct_sock;
2534 #endif
2535 
2536 	return 0;
2537 }
2538 
2539 static void tcp_release_user_frags(struct sock *sk)
2540 {
2541 #ifdef CONFIG_PAGE_POOL
2542 	unsigned long index;
2543 	void *netmem;
2544 
2545 	xa_for_each(&sk->sk_user_frags, index, netmem)
2546 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2547 #endif
2548 }
2549 
2550 void tcp_v4_destroy_sock(struct sock *sk)
2551 {
2552 	struct tcp_sock *tp = tcp_sk(sk);
2553 
2554 	tcp_release_user_frags(sk);
2555 
2556 	xa_destroy(&sk->sk_user_frags);
2557 
2558 	trace_tcp_destroy_sock(sk);
2559 
2560 	tcp_clear_xmit_timers(sk);
2561 
2562 	tcp_cleanup_congestion_control(sk);
2563 
2564 	tcp_cleanup_ulp(sk);
2565 
2566 	/* Clean up the write buffer. */
2567 	tcp_write_queue_purge(sk);
2568 
2569 	/* Check if we want to disable active TFO */
2570 	tcp_fastopen_active_disable_ofo_check(sk);
2571 
2572 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2573 	skb_rbtree_purge(&tp->out_of_order_queue);
2574 
2575 	/* Clean up a referenced TCP bind bucket. */
2576 	if (inet_csk(sk)->icsk_bind_hash)
2577 		inet_put_port(sk);
2578 
2579 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2580 
2581 	/* If socket is aborted during connect operation */
2582 	tcp_free_fastopen_req(tp);
2583 	tcp_fastopen_destroy_cipher(sk);
2584 	tcp_saved_syn_free(tp);
2585 
2586 	sk_sockets_allocated_dec(sk);
2587 }
2588 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2589 
2590 #ifdef CONFIG_PROC_FS
2591 /* Proc filesystem TCP sock list dumping. */
2592 
2593 static unsigned short seq_file_family(const struct seq_file *seq);
2594 
2595 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2596 {
2597 	unsigned short family = seq_file_family(seq);
2598 
2599 	/* AF_UNSPEC is used as a match all */
2600 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2601 		net_eq(sock_net(sk), seq_file_net(seq)));
2602 }
2603 
2604 /* Find a non-empty bucket (starting from st->bucket)
2605  * and return the first sk from it.
2606  */
2607 static void *listening_get_first(struct seq_file *seq)
2608 {
2609 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610 	struct tcp_iter_state *st = seq->private;
2611 
2612 	st->offset = 0;
2613 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2614 		struct inet_listen_hashbucket *ilb2;
2615 		struct hlist_nulls_node *node;
2616 		struct sock *sk;
2617 
2618 		ilb2 = &hinfo->lhash2[st->bucket];
2619 		if (hlist_nulls_empty(&ilb2->nulls_head))
2620 			continue;
2621 
2622 		spin_lock(&ilb2->lock);
2623 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2624 			if (seq_sk_match(seq, sk))
2625 				return sk;
2626 		}
2627 		spin_unlock(&ilb2->lock);
2628 	}
2629 
2630 	return NULL;
2631 }
2632 
2633 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2634  * If "cur" is the last one in the st->bucket,
2635  * call listening_get_first() to return the first sk of the next
2636  * non-empty bucket.
2637  */
2638 static void *listening_get_next(struct seq_file *seq, void *cur)
2639 {
2640 	struct tcp_iter_state *st = seq->private;
2641 	struct inet_listen_hashbucket *ilb2;
2642 	struct hlist_nulls_node *node;
2643 	struct inet_hashinfo *hinfo;
2644 	struct sock *sk = cur;
2645 
2646 	++st->num;
2647 	++st->offset;
2648 
2649 	sk = sk_nulls_next(sk);
2650 	sk_nulls_for_each_from(sk, node) {
2651 		if (seq_sk_match(seq, sk))
2652 			return sk;
2653 	}
2654 
2655 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2656 	ilb2 = &hinfo->lhash2[st->bucket];
2657 	spin_unlock(&ilb2->lock);
2658 	++st->bucket;
2659 	return listening_get_first(seq);
2660 }
2661 
2662 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2663 {
2664 	struct tcp_iter_state *st = seq->private;
2665 	void *rc;
2666 
2667 	st->bucket = 0;
2668 	st->offset = 0;
2669 	rc = listening_get_first(seq);
2670 
2671 	while (rc && *pos) {
2672 		rc = listening_get_next(seq, rc);
2673 		--*pos;
2674 	}
2675 	return rc;
2676 }
2677 
2678 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2679 				const struct tcp_iter_state *st)
2680 {
2681 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2682 }
2683 
2684 /*
2685  * Get first established socket starting from bucket given in st->bucket.
2686  * If st->bucket is zero, the very first socket in the hash is returned.
2687  */
2688 static void *established_get_first(struct seq_file *seq)
2689 {
2690 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2691 	struct tcp_iter_state *st = seq->private;
2692 
2693 	st->offset = 0;
2694 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2695 		struct sock *sk;
2696 		struct hlist_nulls_node *node;
2697 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2698 
2699 		cond_resched();
2700 
2701 		/* Lockless fast path for the common case of empty buckets */
2702 		if (empty_bucket(hinfo, st))
2703 			continue;
2704 
2705 		spin_lock_bh(lock);
2706 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2707 			if (seq_sk_match(seq, sk))
2708 				return sk;
2709 		}
2710 		spin_unlock_bh(lock);
2711 	}
2712 
2713 	return NULL;
2714 }
2715 
2716 static void *established_get_next(struct seq_file *seq, void *cur)
2717 {
2718 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2719 	struct tcp_iter_state *st = seq->private;
2720 	struct hlist_nulls_node *node;
2721 	struct sock *sk = cur;
2722 
2723 	++st->num;
2724 	++st->offset;
2725 
2726 	sk = sk_nulls_next(sk);
2727 
2728 	sk_nulls_for_each_from(sk, node) {
2729 		if (seq_sk_match(seq, sk))
2730 			return sk;
2731 	}
2732 
2733 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2734 	++st->bucket;
2735 	return established_get_first(seq);
2736 }
2737 
2738 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2739 {
2740 	struct tcp_iter_state *st = seq->private;
2741 	void *rc;
2742 
2743 	st->bucket = 0;
2744 	rc = established_get_first(seq);
2745 
2746 	while (rc && pos) {
2747 		rc = established_get_next(seq, rc);
2748 		--pos;
2749 	}
2750 	return rc;
2751 }
2752 
2753 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2754 {
2755 	void *rc;
2756 	struct tcp_iter_state *st = seq->private;
2757 
2758 	st->state = TCP_SEQ_STATE_LISTENING;
2759 	rc	  = listening_get_idx(seq, &pos);
2760 
2761 	if (!rc) {
2762 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2763 		rc	  = established_get_idx(seq, pos);
2764 	}
2765 
2766 	return rc;
2767 }
2768 
2769 static void *tcp_seek_last_pos(struct seq_file *seq)
2770 {
2771 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2772 	struct tcp_iter_state *st = seq->private;
2773 	int bucket = st->bucket;
2774 	int offset = st->offset;
2775 	int orig_num = st->num;
2776 	void *rc = NULL;
2777 
2778 	switch (st->state) {
2779 	case TCP_SEQ_STATE_LISTENING:
2780 		if (st->bucket > hinfo->lhash2_mask)
2781 			break;
2782 		rc = listening_get_first(seq);
2783 		while (offset-- && rc && bucket == st->bucket)
2784 			rc = listening_get_next(seq, rc);
2785 		if (rc)
2786 			break;
2787 		st->bucket = 0;
2788 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2789 		fallthrough;
2790 	case TCP_SEQ_STATE_ESTABLISHED:
2791 		if (st->bucket > hinfo->ehash_mask)
2792 			break;
2793 		rc = established_get_first(seq);
2794 		while (offset-- && rc && bucket == st->bucket)
2795 			rc = established_get_next(seq, rc);
2796 	}
2797 
2798 	st->num = orig_num;
2799 
2800 	return rc;
2801 }
2802 
2803 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2804 {
2805 	struct tcp_iter_state *st = seq->private;
2806 	void *rc;
2807 
2808 	if (*pos && *pos == st->last_pos) {
2809 		rc = tcp_seek_last_pos(seq);
2810 		if (rc)
2811 			goto out;
2812 	}
2813 
2814 	st->state = TCP_SEQ_STATE_LISTENING;
2815 	st->num = 0;
2816 	st->bucket = 0;
2817 	st->offset = 0;
2818 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2819 
2820 out:
2821 	st->last_pos = *pos;
2822 	return rc;
2823 }
2824 EXPORT_IPV6_MOD(tcp_seq_start);
2825 
2826 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2827 {
2828 	struct tcp_iter_state *st = seq->private;
2829 	void *rc = NULL;
2830 
2831 	if (v == SEQ_START_TOKEN) {
2832 		rc = tcp_get_idx(seq, 0);
2833 		goto out;
2834 	}
2835 
2836 	switch (st->state) {
2837 	case TCP_SEQ_STATE_LISTENING:
2838 		rc = listening_get_next(seq, v);
2839 		if (!rc) {
2840 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2841 			st->bucket = 0;
2842 			st->offset = 0;
2843 			rc	  = established_get_first(seq);
2844 		}
2845 		break;
2846 	case TCP_SEQ_STATE_ESTABLISHED:
2847 		rc = established_get_next(seq, v);
2848 		break;
2849 	}
2850 out:
2851 	++*pos;
2852 	st->last_pos = *pos;
2853 	return rc;
2854 }
2855 EXPORT_IPV6_MOD(tcp_seq_next);
2856 
2857 void tcp_seq_stop(struct seq_file *seq, void *v)
2858 {
2859 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2860 	struct tcp_iter_state *st = seq->private;
2861 
2862 	switch (st->state) {
2863 	case TCP_SEQ_STATE_LISTENING:
2864 		if (v != SEQ_START_TOKEN)
2865 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2866 		break;
2867 	case TCP_SEQ_STATE_ESTABLISHED:
2868 		if (v)
2869 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2870 		break;
2871 	}
2872 }
2873 EXPORT_IPV6_MOD(tcp_seq_stop);
2874 
2875 static void get_openreq4(const struct request_sock *req,
2876 			 struct seq_file *f, int i)
2877 {
2878 	const struct inet_request_sock *ireq = inet_rsk(req);
2879 	long delta = req->rsk_timer.expires - jiffies;
2880 
2881 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2882 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2883 		i,
2884 		ireq->ir_loc_addr,
2885 		ireq->ir_num,
2886 		ireq->ir_rmt_addr,
2887 		ntohs(ireq->ir_rmt_port),
2888 		TCP_SYN_RECV,
2889 		0, 0, /* could print option size, but that is af dependent. */
2890 		1,    /* timers active (only the expire timer) */
2891 		jiffies_delta_to_clock_t(delta),
2892 		req->num_timeout,
2893 		from_kuid_munged(seq_user_ns(f),
2894 				 sk_uid(req->rsk_listener)),
2895 		0,  /* non standard timer */
2896 		0, /* open_requests have no inode */
2897 		0,
2898 		req);
2899 }
2900 
2901 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2902 {
2903 	int timer_active;
2904 	unsigned long timer_expires;
2905 	const struct tcp_sock *tp = tcp_sk(sk);
2906 	const struct inet_connection_sock *icsk = inet_csk(sk);
2907 	const struct inet_sock *inet = inet_sk(sk);
2908 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2909 	__be32 dest = inet->inet_daddr;
2910 	__be32 src = inet->inet_rcv_saddr;
2911 	__u16 destp = ntohs(inet->inet_dport);
2912 	__u16 srcp = ntohs(inet->inet_sport);
2913 	u8 icsk_pending;
2914 	int rx_queue;
2915 	int state;
2916 
2917 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2918 	if (icsk_pending == ICSK_TIME_RETRANS ||
2919 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2920 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2921 		timer_active	= 1;
2922 		timer_expires	= icsk_timeout(icsk);
2923 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2924 		timer_active	= 4;
2925 		timer_expires	= icsk_timeout(icsk);
2926 	} else if (timer_pending(&sk->sk_timer)) {
2927 		timer_active	= 2;
2928 		timer_expires	= sk->sk_timer.expires;
2929 	} else {
2930 		timer_active	= 0;
2931 		timer_expires = jiffies;
2932 	}
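	/* timer_active legend for the output below: 1 - retransmit, RACK
	 * reordering or tail loss probe timer, 4 - zero window probe timer,
	 * 2 - keepalive timer (sk_timer), 0 - no timer pending.
	 */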
2933 
2934 	state = inet_sk_state_load(sk);
2935 	if (state == TCP_LISTEN)
2936 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2937 	else
2938 		/* Because we don't lock the socket,
2939 		 * we might find a transient negative value.
2940 		 */
2941 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2942 				      READ_ONCE(tp->copied_seq), 0);
2943 
2944 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2945 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2946 		i, src, srcp, dest, destp, state,
2947 		READ_ONCE(tp->write_seq) - tp->snd_una,
2948 		rx_queue,
2949 		timer_active,
2950 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2951 		READ_ONCE(icsk->icsk_retransmits),
2952 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2953 		READ_ONCE(icsk->icsk_probes_out),
2954 		sock_i_ino(sk),
2955 		refcount_read(&sk->sk_refcnt), sk,
2956 		jiffies_to_clock_t(icsk->icsk_rto),
2957 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2958 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2959 		tcp_snd_cwnd(tp),
2960 		state == TCP_LISTEN ?
2961 		    fastopenq->max_qlen :
2962 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2963 }
2964 
2965 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2966 			       struct seq_file *f, int i)
2967 {
2968 	long delta = tw->tw_timer.expires - jiffies;
2969 	__be32 dest, src;
2970 	__u16 destp, srcp;
2971 
2972 	dest  = tw->tw_daddr;
2973 	src   = tw->tw_rcv_saddr;
2974 	destp = ntohs(tw->tw_dport);
2975 	srcp  = ntohs(tw->tw_sport);
2976 
2977 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2978 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2979 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2980 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2981 		refcount_read(&tw->tw_refcnt), tw);
2982 }
2983 
2984 #define TMPSZ 150
2985 
2986 static int tcp4_seq_show(struct seq_file *seq, void *v)
2987 {
2988 	struct tcp_iter_state *st;
2989 	struct sock *sk = v;
2990 
2991 	seq_setwidth(seq, TMPSZ - 1);
2992 	if (v == SEQ_START_TOKEN) {
2993 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2994 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2995 			   "inode");
2996 		goto out;
2997 	}
2998 	st = seq->private;
2999 
3000 	if (sk->sk_state == TCP_TIME_WAIT)
3001 		get_timewait4_sock(v, seq, st->num);
3002 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
3003 		get_openreq4(v, seq, st->num);
3004 	else
3005 		get_tcp4_sock(v, seq, st->num);
3006 out:
3007 	seq_pad(seq, '\n');
3008 	return 0;
3009 }
3010 
3011 #ifdef CONFIG_BPF_SYSCALL
3012 union bpf_tcp_iter_batch_item {
3013 	struct sock *sk;
3014 	__u64 cookie;
3015 };
3016 
3017 struct bpf_tcp_iter_state {
3018 	struct tcp_iter_state state;
3019 	unsigned int cur_sk;
3020 	unsigned int end_sk;
3021 	unsigned int max_sk;
3022 	union bpf_tcp_iter_batch_item *batch;
3023 };
3024 
3025 struct bpf_iter__tcp {
3026 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3027 	__bpf_md_ptr(struct sock_common *, sk_common);
3028 	uid_t uid __aligned(8);
3029 };
3030 
3031 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3032 			     struct sock_common *sk_common, uid_t uid)
3033 {
3034 	struct bpf_iter__tcp ctx;
3035 
3036 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3037 	ctx.meta = meta;
3038 	ctx.sk_common = sk_common;
3039 	ctx.uid = uid;
3040 	return bpf_iter_run_prog(prog, &ctx);
3041 }
3042 
3043 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3044 {
3045 	union bpf_tcp_iter_batch_item *item;
3046 	unsigned int cur_sk = iter->cur_sk;
3047 	__u64 cookie;
3048 
3049 	/* Remember the cookies of the sockets we haven't seen yet, so we can
3050 	 * pick up where we left off next time around.
3051 	 */
3052 	while (cur_sk < iter->end_sk) {
3053 		item = &iter->batch[cur_sk++];
3054 		cookie = sock_gen_cookie(item->sk);
3055 		sock_gen_put(item->sk);
3056 		item->cookie = cookie;
3057 	}
3058 }
3059 
3060 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3061 				      unsigned int new_batch_sz, gfp_t flags)
3062 {
3063 	union bpf_tcp_iter_batch_item *new_batch;
3064 
3065 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3066 			     flags | __GFP_NOWARN);
3067 	if (!new_batch)
3068 		return -ENOMEM;
3069 
3070 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3071 	kvfree(iter->batch);
3072 	iter->batch = new_batch;
3073 	iter->max_sk = new_batch_sz;
3074 
3075 	return 0;
3076 }
3077 
3078 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3079 					       union bpf_tcp_iter_batch_item *cookies,
3080 					       int n_cookies)
3081 {
3082 	struct hlist_nulls_node *node;
3083 	struct sock *sk;
3084 	int i;
3085 
3086 	for (i = 0; i < n_cookies; i++) {
3087 		sk = first_sk;
3088 		sk_nulls_for_each_from(sk, node)
3089 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3090 				return sk;
3091 	}
3092 
3093 	return NULL;
3094 }
3095 
3096 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3097 {
3098 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3099 	struct bpf_tcp_iter_state *iter = seq->private;
3100 	struct tcp_iter_state *st = &iter->state;
3101 	unsigned int find_cookie = iter->cur_sk;
3102 	unsigned int end_cookie = iter->end_sk;
3103 	int resume_bucket = st->bucket;
3104 	struct sock *sk;
3105 
3106 	if (end_cookie && find_cookie == end_cookie)
3107 		++st->bucket;
3108 
3109 	sk = listening_get_first(seq);
3110 	iter->cur_sk = 0;
3111 	iter->end_sk = 0;
3112 
3113 	if (sk && st->bucket == resume_bucket && end_cookie) {
3114 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3115 						end_cookie - find_cookie);
3116 		if (!sk) {
3117 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3118 			++st->bucket;
3119 			sk = listening_get_first(seq);
3120 		}
3121 	}
3122 
3123 	return sk;
3124 }
3125 
3126 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3127 {
3128 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3129 	struct bpf_tcp_iter_state *iter = seq->private;
3130 	struct tcp_iter_state *st = &iter->state;
3131 	unsigned int find_cookie = iter->cur_sk;
3132 	unsigned int end_cookie = iter->end_sk;
3133 	int resume_bucket = st->bucket;
3134 	struct sock *sk;
3135 
3136 	if (end_cookie && find_cookie == end_cookie)
3137 		++st->bucket;
3138 
3139 	sk = established_get_first(seq);
3140 	iter->cur_sk = 0;
3141 	iter->end_sk = 0;
3142 
3143 	if (sk && st->bucket == resume_bucket && end_cookie) {
3144 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3145 						end_cookie - find_cookie);
3146 		if (!sk) {
3147 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3148 			++st->bucket;
3149 			sk = established_get_first(seq);
3150 		}
3151 	}
3152 
3153 	return sk;
3154 }
3155 
3156 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3157 {
3158 	struct bpf_tcp_iter_state *iter = seq->private;
3159 	struct tcp_iter_state *st = &iter->state;
3160 	struct sock *sk = NULL;
3161 
3162 	switch (st->state) {
3163 	case TCP_SEQ_STATE_LISTENING:
3164 		sk = bpf_iter_tcp_resume_listening(seq);
3165 		if (sk)
3166 			break;
3167 		st->bucket = 0;
3168 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3169 		fallthrough;
3170 	case TCP_SEQ_STATE_ESTABLISHED:
3171 		sk = bpf_iter_tcp_resume_established(seq);
3172 		break;
3173 	}
3174 
3175 	return sk;
3176 }
3177 
3178 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3179 						 struct sock **start_sk)
3180 {
3181 	struct bpf_tcp_iter_state *iter = seq->private;
3182 	struct hlist_nulls_node *node;
3183 	unsigned int expected = 1;
3184 	struct sock *sk;
3185 
3186 	sock_hold(*start_sk);
3187 	iter->batch[iter->end_sk++].sk = *start_sk;
3188 
3189 	sk = sk_nulls_next(*start_sk);
3190 	*start_sk = NULL;
3191 	sk_nulls_for_each_from(sk, node) {
3192 		if (seq_sk_match(seq, sk)) {
3193 			if (iter->end_sk < iter->max_sk) {
3194 				sock_hold(sk);
3195 				iter->batch[iter->end_sk++].sk = sk;
3196 			} else if (!*start_sk) {
3197 				/* Remember where we left off. */
3198 				*start_sk = sk;
3199 			}
3200 			expected++;
3201 		}
3202 	}
3203 
3204 	return expected;
3205 }
3206 
3207 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3208 						   struct sock **start_sk)
3209 {
3210 	struct bpf_tcp_iter_state *iter = seq->private;
3211 	struct hlist_nulls_node *node;
3212 	unsigned int expected = 1;
3213 	struct sock *sk;
3214 
3215 	sock_hold(*start_sk);
3216 	iter->batch[iter->end_sk++].sk = *start_sk;
3217 
3218 	sk = sk_nulls_next(*start_sk);
3219 	*start_sk = NULL;
3220 	sk_nulls_for_each_from(sk, node) {
3221 		if (seq_sk_match(seq, sk)) {
3222 			if (iter->end_sk < iter->max_sk) {
3223 				sock_hold(sk);
3224 				iter->batch[iter->end_sk++].sk = sk;
3225 			} else if (!*start_sk) {
3226 				/* Remember where we left off. */
3227 				*start_sk = sk;
3228 			}
3229 			expected++;
3230 		}
3231 	}
3232 
3233 	return expected;
3234 }
3235 
3236 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3237 					struct sock **start_sk)
3238 {
3239 	struct bpf_tcp_iter_state *iter = seq->private;
3240 	struct tcp_iter_state *st = &iter->state;
3241 
3242 	if (st->state == TCP_SEQ_STATE_LISTENING)
3243 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3244 	else
3245 		return bpf_iter_tcp_established_batch(seq, start_sk);
3246 }
3247 
3248 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3249 {
3250 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3251 	struct bpf_tcp_iter_state *iter = seq->private;
3252 	struct tcp_iter_state *st = &iter->state;
3253 
3254 	if (st->state == TCP_SEQ_STATE_LISTENING)
3255 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3256 	else
3257 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3258 }
3259 
3260 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3261 {
3262 	struct bpf_tcp_iter_state *iter = seq->private;
3263 	unsigned int expected;
3264 	struct sock *sk;
3265 	int err;
3266 
3267 	sk = bpf_iter_tcp_resume(seq);
3268 	if (!sk)
3269 		return NULL; /* Done */
3270 
3271 	expected = bpf_iter_fill_batch(seq, &sk);
3272 	if (likely(iter->end_sk == expected))
3273 		goto done;
3274 
3275 	/* Batch size was too small. */
3276 	bpf_iter_tcp_unlock_bucket(seq);
3277 	bpf_iter_tcp_put_batch(iter);
3278 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3279 					 GFP_USER);
3280 	if (err)
3281 		return ERR_PTR(err);
3282 
3283 	sk = bpf_iter_tcp_resume(seq);
3284 	if (!sk)
3285 		return NULL; /* Done */
3286 
3287 	expected = bpf_iter_fill_batch(seq, &sk);
3288 	if (likely(iter->end_sk == expected))
3289 		goto done;
3290 
3291 	/* Batch size was still too small. Hold onto the lock while we try
3292 	 * again with a larger batch to make sure the current bucket's size
3293 	 * does not change in the meantime.
3294 	 */
3295 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3296 	if (err) {
3297 		bpf_iter_tcp_unlock_bucket(seq);
3298 		return ERR_PTR(err);
3299 	}
3300 
3301 	expected = bpf_iter_fill_batch(seq, &sk);
3302 	WARN_ON_ONCE(iter->end_sk != expected);
3303 done:
3304 	bpf_iter_tcp_unlock_bucket(seq);
3305 	return iter->batch[0].sk;
3306 }
3307 
3308 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3309 {
3310 	/* bpf iter does not support lseek, so it always
3311 	 * continues from where it was stop()-ped.
3312 	 */
3313 	if (*pos)
3314 		return bpf_iter_tcp_batch(seq);
3315 
3316 	return SEQ_START_TOKEN;
3317 }
3318 
3319 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3320 {
3321 	struct bpf_tcp_iter_state *iter = seq->private;
3322 	struct tcp_iter_state *st = &iter->state;
3323 	struct sock *sk;
3324 
3325 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3326 	 * done with seq_show(), so advance to the next sk in
3327 	 * the batch.
3328 	 */
3329 	if (iter->cur_sk < iter->end_sk) {
3330 		/* Keeping st->num consistent in tcp_iter_state.
3331 		 * bpf_iter_tcp does not use st->num.
3332 		 * meta.seq_num is used instead.
3333 		 */
3334 		st->num++;
3335 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3336 	}
3337 
3338 	if (iter->cur_sk < iter->end_sk)
3339 		sk = iter->batch[iter->cur_sk].sk;
3340 	else
3341 		sk = bpf_iter_tcp_batch(seq);
3342 
3343 	++*pos;
3344 	/* Keeping st->last_pos consistent in tcp_iter_state.
3345 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3346 	 */
3347 	st->last_pos = *pos;
3348 	return sk;
3349 }
3350 
3351 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3352 {
3353 	struct bpf_iter_meta meta;
3354 	struct bpf_prog *prog;
3355 	struct sock *sk = v;
3356 	uid_t uid;
3357 	int ret;
3358 
3359 	if (v == SEQ_START_TOKEN)
3360 		return 0;
3361 
3362 	if (sk_fullsock(sk))
3363 		lock_sock(sk);
3364 
3365 	if (unlikely(sk_unhashed(sk))) {
3366 		ret = SEQ_SKIP;
3367 		goto unlock;
3368 	}
3369 
3370 	if (sk->sk_state == TCP_TIME_WAIT) {
3371 		uid = 0;
3372 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3373 		const struct request_sock *req = v;
3374 
3375 		uid = from_kuid_munged(seq_user_ns(seq),
3376 				       sk_uid(req->rsk_listener));
3377 	} else {
3378 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3379 	}
3380 
3381 	meta.seq = seq;
3382 	prog = bpf_iter_get_info(&meta, false);
3383 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3384 
3385 unlock:
3386 	if (sk_fullsock(sk))
3387 		release_sock(sk);
3388 	return ret;
3389 
3390 }
3391 
3392 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3393 {
3394 	struct bpf_tcp_iter_state *iter = seq->private;
3395 	struct bpf_iter_meta meta;
3396 	struct bpf_prog *prog;
3397 
3398 	if (!v) {
3399 		meta.seq = seq;
3400 		prog = bpf_iter_get_info(&meta, true);
3401 		if (prog)
3402 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3403 	}
3404 
3405 	if (iter->cur_sk < iter->end_sk)
3406 		bpf_iter_tcp_put_batch(iter);
3407 }
3408 
3409 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3410 	.show		= bpf_iter_tcp_seq_show,
3411 	.start		= bpf_iter_tcp_seq_start,
3412 	.next		= bpf_iter_tcp_seq_next,
3413 	.stop		= bpf_iter_tcp_seq_stop,
3414 };
3415 #endif
3416 static unsigned short seq_file_family(const struct seq_file *seq)
3417 {
3418 	const struct tcp_seq_afinfo *afinfo;
3419 
3420 #ifdef CONFIG_BPF_SYSCALL
3421 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3422 	if (seq->op == &bpf_iter_tcp_seq_ops)
3423 		return AF_UNSPEC;
3424 #endif
3425 
3426 	/* Iterated from proc fs */
3427 	afinfo = pde_data(file_inode(seq->file));
3428 	return afinfo->family;
3429 }
3430 
3431 static const struct seq_operations tcp4_seq_ops = {
3432 	.show		= tcp4_seq_show,
3433 	.start		= tcp_seq_start,
3434 	.next		= tcp_seq_next,
3435 	.stop		= tcp_seq_stop,
3436 };
3437 
3438 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3439 	.family		= AF_INET,
3440 };
3441 
3442 static int __net_init tcp4_proc_init_net(struct net *net)
3443 {
3444 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3445 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3446 		return -ENOMEM;
3447 	return 0;
3448 }
3449 
3450 static void __net_exit tcp4_proc_exit_net(struct net *net)
3451 {
3452 	remove_proc_entry("tcp", net->proc_net);
3453 }
3454 
3455 static struct pernet_operations tcp4_net_ops = {
3456 	.init = tcp4_proc_init_net,
3457 	.exit = tcp4_proc_exit_net,
3458 };
3459 
3460 int __init tcp4_proc_init(void)
3461 {
3462 	return register_pernet_subsys(&tcp4_net_ops);
3463 }
3464 
3465 void tcp4_proc_exit(void)
3466 {
3467 	unregister_pernet_subsys(&tcp4_net_ops);
3468 }
3469 #endif /* CONFIG_PROC_FS */
3470 
3471 /* @wake is one when sk_stream_write_space() calls us.
3472  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3473  * This mimics the strategy used in sock_def_write_space().
3474  */
3475 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3476 {
3477 	const struct tcp_sock *tp = tcp_sk(sk);
3478 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3479 			    READ_ONCE(tp->snd_nxt);
3480 
3481 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3482 }
3483 EXPORT_SYMBOL(tcp_stream_memory_free);
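/* Worked example (hypothetical numbers): with TCP_NOTSENT_LOWAT at 131072
 * bytes and 70000 not-yet-sent bytes, a wake-up check (wake == 1) computes
 * 140000 >= 131072 and reports no room; 60000 unsent bytes give
 * 120000 < 131072 and allow EPOLLOUT.  A minimal userspace sketch relying
 * on this behaviour (assumed socket fd, error handling omitted):
 *
 *	int lowat = 131072;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);
 *
 * The poll() wakes once enough unsent data has drained below the limit.
 */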
3484 
3485 struct proto tcp_prot = {
3486 	.name			= "TCP",
3487 	.owner			= THIS_MODULE,
3488 	.close			= tcp_close,
3489 	.pre_connect		= tcp_v4_pre_connect,
3490 	.connect		= tcp_v4_connect,
3491 	.disconnect		= tcp_disconnect,
3492 	.accept			= inet_csk_accept,
3493 	.ioctl			= tcp_ioctl,
3494 	.init			= tcp_v4_init_sock,
3495 	.destroy		= tcp_v4_destroy_sock,
3496 	.shutdown		= tcp_shutdown,
3497 	.setsockopt		= tcp_setsockopt,
3498 	.getsockopt		= tcp_getsockopt,
3499 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3500 	.keepalive		= tcp_set_keepalive,
3501 	.recvmsg		= tcp_recvmsg,
3502 	.sendmsg		= tcp_sendmsg,
3503 	.splice_eof		= tcp_splice_eof,
3504 	.backlog_rcv		= tcp_v4_do_rcv,
3505 	.release_cb		= tcp_release_cb,
3506 	.hash			= inet_hash,
3507 	.unhash			= inet_unhash,
3508 	.get_port		= inet_csk_get_port,
3509 	.put_port		= inet_put_port,
3510 #ifdef CONFIG_BPF_SYSCALL
3511 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3512 #endif
3513 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3514 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3515 	.stream_memory_free	= tcp_stream_memory_free,
3516 	.sockets_allocated	= &tcp_sockets_allocated,
3517 
3518 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3519 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3520 
3521 	.memory_pressure	= &tcp_memory_pressure,
3522 	.sysctl_mem		= sysctl_tcp_mem,
3523 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3524 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3525 	.max_header		= MAX_TCP_HEADER,
3526 	.obj_size		= sizeof(struct tcp_sock),
3527 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3528 	.twsk_prot		= &tcp_timewait_sock_ops,
3529 	.rsk_prot		= &tcp_request_sock_ops,
3530 	.h.hashinfo		= NULL,
3531 	.no_autobind		= true,
3532 	.diag_destroy		= tcp_abort,
3533 };
3534 EXPORT_SYMBOL(tcp_prot);
3535 
3536 static void __net_exit tcp_sk_exit(struct net *net)
3537 {
3538 	if (net->ipv4.tcp_congestion_control)
3539 		bpf_module_put(net->ipv4.tcp_congestion_control,
3540 			       net->ipv4.tcp_congestion_control->owner);
3541 }
3542 
3543 static void __net_init tcp_set_hashinfo(struct net *net)
3544 {
3545 	struct inet_hashinfo *hinfo;
3546 	unsigned int ehash_entries;
3547 	struct net *old_net;
3548 
3549 	if (net_eq(net, &init_net))
3550 		goto fallback;
3551 
3552 	old_net = current->nsproxy->net_ns;
3553 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3554 	if (!ehash_entries)
3555 		goto fallback;
3556 
3557 	ehash_entries = roundup_pow_of_two(ehash_entries);
3558 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3559 	if (!hinfo) {
3560 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3561 			"for a netns, fallback to the global one\n",
3562 			ehash_entries);
3563 fallback:
3564 		hinfo = &tcp_hashinfo;
3565 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3566 	}
3567 
3568 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3569 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3570 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
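	/* For instance, a child netns whose ehash_entries round up to 65536
	 * ends up with sysctl_max_tw_buckets = 32768 and
	 * sysctl_max_syn_backlog = 512.
	 */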
3571 }
3572 
3573 static int __net_init tcp_sk_init(struct net *net)
3574 {
3575 	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3576 	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3577 	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3578 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3579 
3580 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3581 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3582 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3583 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3584 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3585 
3586 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3587 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3588 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3589 
3590 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3591 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3592 	net->ipv4.sysctl_tcp_syncookies = 1;
3593 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3594 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3595 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3596 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3597 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3598 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3599 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3600 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3601 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3602 
3603 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3604 	tcp_set_hashinfo(net);
3605 
3606 	net->ipv4.sysctl_tcp_sack = 1;
3607 	net->ipv4.sysctl_tcp_window_scaling = 1;
3608 	net->ipv4.sysctl_tcp_timestamps = 1;
3609 	net->ipv4.sysctl_tcp_early_retrans = 3;
3610 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3611 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3612 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3613 	net->ipv4.sysctl_tcp_max_reordering = 300;
3614 	net->ipv4.sysctl_tcp_dsack = 1;
3615 	net->ipv4.sysctl_tcp_app_win = 31;
3616 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3617 	net->ipv4.sysctl_tcp_frto = 2;
3618 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3619 	/* This limits the percentage of the congestion window which we
3620 	 * will allow a single TSO frame to consume.  Building TSO frames
3621 	 * which are too large can cause TCP streams to be bursty.
3622 	 */
3623 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3624 	/* Default TSQ limit of 4 MB */
3625 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3626 
3627 	/* RFC 5961 challenge ACK rate limiting, per netns; INT_MAX effectively disables the limit by default. */
3628 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3629 
3630 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3631 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3632 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3633 	net->ipv4.sysctl_tcp_autocorking = 1;
3634 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3635 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3636 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3637 	if (net != &init_net) {
3638 		memcpy(net->ipv4.sysctl_tcp_rmem,
3639 		       init_net.ipv4.sysctl_tcp_rmem,
3640 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3641 		memcpy(net->ipv4.sysctl_tcp_wmem,
3642 		       init_net.ipv4.sysctl_tcp_wmem,
3643 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3644 	}
3645 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3646 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3647 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3648 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
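	/* Fast Open is enabled for the client side only by default; a blackhole
	 * timeout of 0 is understood to keep the active-TFO blackhole back-off
	 * disabled until the corresponding sysctl is raised.
	 */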
3649 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3650 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3651 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3652 
3653 	/* Set default values for PLB (Protective Load Balancing) */
3654 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3655 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3656 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3657 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3658 	/* Default congestion threshold for PLB to mark a round is 50% */
3659 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3660 
3661 	/* Reno is always built in */
3662 	if (!net_eq(net, &init_net) &&
3663 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3664 			       init_net.ipv4.tcp_congestion_control->owner))
3665 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3666 	else
3667 		net->ipv4.tcp_congestion_control = &tcp_reno;
3668 
3669 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3670 	net->ipv4.sysctl_tcp_shrink_window = 0;
3671 
3672 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3673 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3674 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3675 
3676 	return 0;
3677 }
3678 
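/* Batched netns teardown: purge TIME-WAIT sockets, drop the death-row
 * refcount taken in tcp_sk_init(), free any per-netns ehash allocated by
 * tcp_set_hashinfo(), and destroy the netns' Fast Open key context.
 */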
3679 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3680 {
3681 	struct net *net;
3682 
3683 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3684 	 * and from the failed setup_net() error-unwinding path are serialized.
3685 	 *
3686 	 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those
3687 	 * on net_exit_list, the thread that dismantles a particular twsk must
3688 	 * do so without another thread progressing to refcount_dec_and_test() of
3689 	 * tcp_death_row.tw_refcount.
3690 	 */
3691 	mutex_lock(&tcp_exit_batch_mutex);
3692 
3693 	tcp_twsk_purge(net_exit_list);
3694 
3695 	list_for_each_entry(net, net_exit_list, exit_list) {
3696 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3697 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3698 		tcp_fastopen_ctx_destroy(net);
3699 	}
3700 
3701 	mutex_unlock(&tcp_exit_batch_mutex);
3702 }
3703 
3704 static struct pernet_operations __net_initdata tcp_sk_ops = {
3705 	.init	    = tcp_sk_init,
3706 	.exit	    = tcp_sk_exit,
3707 	.exit_batch = tcp_sk_exit_batch,
3708 };
3709 
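/* BPF iterator support: register a "tcp" iterator target so BPF programs can
 * walk TCP sockets through essentially the same seq_file machinery that backs
 * /proc/net/tcp, receiving each visited socket as the sk_common context
 * argument declared below.
 */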
3710 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3711 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3712 		     struct sock_common *sk_common, uid_t uid)
3713 
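/* Initial capacity (in sockets) of the iterator's batch array, allocated in
 * bpf_iter_init_tcp() below; it is presumably grown on demand by
 * bpf_iter_tcp_realloc_batch() (defined earlier in this file) when a hash
 * bucket holds more sockets than currently fit.
 */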
3714 #define INIT_BATCH_SZ 16
3715 
3716 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3717 {
3718 	struct bpf_tcp_iter_state *iter = priv_data;
3719 	int err;
3720 
3721 	err = bpf_iter_init_seq_net(priv_data, aux);
3722 	if (err)
3723 		return err;
3724 
3725 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3726 	if (err) {
3727 		bpf_iter_fini_seq_net(priv_data);
3728 		return err;
3729 	}
3730 
3731 	return 0;
3732 }
3733 
3734 static void bpf_iter_fini_tcp(void *priv_data)
3735 {
3736 	struct bpf_tcp_iter_state *iter = priv_data;
3737 
3738 	bpf_iter_fini_seq_net(priv_data);
3739 	kvfree(iter->batch);
3740 }
3741 
3742 static const struct bpf_iter_seq_info tcp_seq_info = {
3743 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3744 	.init_seq_private	= bpf_iter_init_tcp,
3745 	.fini_seq_private	= bpf_iter_fini_tcp,
3746 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3747 };
3748 
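/* Extra helpers for "tcp" iterator programs: besides the generic tracing
 * helpers, programs attached to this target may call bpf_setsockopt() /
 * bpf_getsockopt() on the sockets they visit (wired up below through
 * bpf_sk_setsockopt_proto / bpf_sk_getsockopt_proto).
 *
 * Illustrative sketch only, modelled loosely on the bpf_iter selftests and
 * not part of this file (the program name and the chosen option are made up):
 *
 *	SEC("iter/tcp")
 *	int set_cc(struct bpf_iter__tcp *ctx)
 *	{
 *		struct tcp_sock *tp;
 *		char cc[] = "cubic";
 *
 *		if (!ctx->sk_common)
 *			return 0;
 *		tp = bpf_skc_to_tcp_sock(ctx->sk_common);
 *		if (!tp)
 *			return 0;
 *		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
 *		return 0;
 *	}
 */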
3749 static const struct bpf_func_proto *
3750 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3751 			    const struct bpf_prog *prog)
3752 {
3753 	switch (func_id) {
3754 	case BPF_FUNC_setsockopt:
3755 		return &bpf_sk_setsockopt_proto;
3756 	case BPF_FUNC_getsockopt:
3757 		return &bpf_sk_getsockopt_proto;
3758 	default:
3759 		return NULL;
3760 	}
3761 }
3762 
3763 static struct bpf_iter_reg tcp_reg_info = {
3764 	.target			= "tcp",
3765 	.ctx_arg_info_size	= 1,
3766 	.ctx_arg_info		= {
3767 		{ offsetof(struct bpf_iter__tcp, sk_common),
3768 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3769 	},
3770 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3771 	.seq_info		= &tcp_seq_info,
3772 };
3773 
3774 static void __init bpf_iter_register(void)
3775 {
3776 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3777 	if (bpf_iter_reg_target(&tcp_reg_info))
3778 		pr_warn("Warning: could not register bpf iterator tcp\n");
3779 }
3780 
3781 #endif
3782 
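/* Boot-time setup: create one kernel control socket per possible CPU (used,
 * among other things, to send RSTs and ACKs on behalf of SYN-RECV and
 * TIME-WAIT sockets), stash it in the per-cpu ipv4_tcp_sk slot, and register
 * the per-netns init/exit operations defined above.
 */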
3783 void __init tcp_v4_init(void)
3784 {
3785 	int cpu, res;
3786 
3787 	for_each_possible_cpu(cpu) {
3788 		struct sock *sk;
3789 
3790 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3791 					   IPPROTO_TCP, &init_net);
3792 		if (res)
3793 			panic("Failed to create the TCP control socket.\n");
3794 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3795 
3796 		/* PMTUDISC_DO enforces IP_DF and IPID==0 for the RSTs and
3797 		 * ACKs sent on behalf of SYN-RECV and TIME-WAIT sockets.
3798 		 */
3799 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3800 
3801 		sk->sk_clockid = CLOCK_MONOTONIC;
3802 
3803 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3804 	}
3805 	if (register_pernet_subsys(&tcp_sk_ops))
3806 		panic("Failed to register the TCP pernet operations.\n");
3807 
3808 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3809 	bpf_iter_register();
3810 #endif
3811 }
3812