xref: /linux/net/ipv4/tcp_ipv4.c (revision abacaf559950eec0d99d37ff6b92049409af5943)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 #include <crypto/utils.h>
92 
93 #include <trace/events/tcp.h>
94 
95 #ifdef CONFIG_TCP_MD5SIG
96 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
97 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
98 #endif
99 
/* Global hash tables for IPv4 (and v4-mapped) TCP sockets. */
struct inet_hashinfo tcp_hashinfo;

/* Per-CPU kernel control socket used to send replies (RST/ACK) outside
 * full socket context; bh_lock serializes its use from BH context
 * (a real lock only on PREEMPT_RT, see local_lock_nested_bh() users below).
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Serializes TCP netns exit batches (taken by the pernet exit path). */
static DEFINE_MUTEX(tcp_exit_batch_mutex);
107 
108 static union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net * net,const struct sk_buff * skb)109 tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
110 {
111 	return secure_tcp_seq_and_ts_off(net,
112 					 ip_hdr(skb)->daddr,
113 					 ip_hdr(skb)->saddr,
114 					 tcp_hdr(skb)->dest,
115 					 tcp_hdr(skb)->source);
116 }
117 
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 	struct tcp_sock *tp = tcp_sk(sk);
124 	int ts_recent_stamp;
125 	u32 reuse_thresh;
126 
127 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 		reuse = 0;
129 
130 	if (reuse == 2) {
131 		/* Still does not detect *everything* that goes through
132 		 * lo, since we require a loopback src or dst address
133 		 * or direct binding to 'lo' interface.
134 		 */
135 		bool loopback = false;
136 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 			loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 		if (tw->tw_family == AF_INET6) {
140 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 				loopback = true;
145 		} else
146 #endif
147 		{
148 			if (ipv4_is_loopback(tw->tw_daddr) ||
149 			    ipv4_is_loopback(tw->tw_rcv_saddr))
150 				loopback = true;
151 		}
152 		if (!loopback)
153 			reuse = 0;
154 	}
155 
156 	/* With PAWS, it is safe from the viewpoint
157 	   of data integrity. Even without PAWS it is safe provided sequence
158 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159 
160 	   Actually, the idea is close to VJ's one, only timestamp cache is
161 	   held not per host, but per port pair and TW bucket is used as state
162 	   holder.
163 
164 	   If TW bucket has been already destroyed we fall back to VJ's scheme
165 	   and use initial timestamp retrieved from peer table.
166 	 */
167 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 	if (ts_recent_stamp &&
171 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 		 * and releasing the bucket lock.
174 		 */
175 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 			return 0;
177 
178 		/* In case of repair and re-using TIME-WAIT sockets we still
179 		 * want to be sure that it is safe as above but honor the
180 		 * sequence numbers and time stamps set as part of the repair
181 		 * process.
182 		 *
183 		 * Without this check re-using a TIME-WAIT socket with TCP
184 		 * repair would accumulate a -1 on the repair assigned
185 		 * sequence number. The first time it is reused the sequence
186 		 * is -1, the second time -2, etc. This fixes that issue
187 		 * without appearing to create any others.
188 		 */
189 		if (likely(!tp->repair)) {
190 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191 
192 			if (!seq)
193 				seq = 1;
194 			WRITE_ONCE(tp->write_seq, seq);
195 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
196 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 		}
198 
199 		return 1;
200 	}
201 
202 	return 0;
203 }
204 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
205 
tcp_v4_pre_connect(struct sock * sk,struct sockaddr_unsized * uaddr,int addr_len)206 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
207 			      int addr_len)
208 {
209 	/* This check is replicated from tcp_v4_connect() and intended to
210 	 * prevent BPF program called below from accessing bytes that are out
211 	 * of the bound specified by user in addr_len.
212 	 */
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	sock_owned_by_me(sk);
217 
218 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
219 }
220 
/* This will initiate an outgoing connection.
 * Called with the socket lock held; validates @uaddr, routes the flow,
 * binds a source address/port, hashes the socket, picks the initial
 * sequence number / timestamp offset and sends the SYN via tcp_connect().
 * Returns 0 on success or a negative errno; on failure the socket is
 * returned to TCP_CLOSE with its binding state reset.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With source routing, route towards the first hop of the option. */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast/broadcast destinations. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	/* No source address yet: adopt the one the route chose and update
	 * the bhash2 entry accordingly.
	 */
	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-route now that the source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		union tcp_seq_and_ts_off st;

		st = secure_tcp_seq_and_ts_off(net,
					       inet->inet_saddr,
					       inet->inet_daddr,
					       inet->inet_sport,
					       usin->sin_port);
		/* Keep a write_seq already inherited from a reused twsk
		 * (see tcp_twsk_unique()); 0 means "unset".
		 */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq, st.seq);
		WRITE_ONCE(tp->tsoffset, st.ts_off);
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);
366 
367 /*
368  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
369  * It can be called through tcp_release_cb() if socket was owned by user
370  * at the time tcp_v4_err() was called to handle ICMP message.
371  */
tcp_v4_mtu_reduced(struct sock * sk)372 void tcp_v4_mtu_reduced(struct sock *sk)
373 {
374 	struct inet_sock *inet = inet_sk(sk);
375 	struct dst_entry *dst;
376 	u32 mtu, dmtu;
377 
378 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
379 		return;
380 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
381 	dst = inet_csk_update_pmtu(sk, mtu);
382 	if (!dst)
383 		return;
384 
385 	/* Something is about to be wrong... Remember soft error
386 	 * for the case, if this connection will not able to recover.
387 	 */
388 	dmtu = dst4_mtu(dst);
389 	if (mtu < dmtu && ip_dont_fragment(sk, dst))
390 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
391 
392 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
393 	    ip_sk_accept_pmtu(sk) &&
394 	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
395 		tcp_sync_mss(sk, dmtu);
396 
397 		/* Resend the TCP packet because it's
398 		 * clear that the old packet has been
399 		 * dropped. This is the new "fast" path mtu
400 		 * discovery.
401 		 */
402 		tcp_simple_retransmit(sk);
403 	} /* else let the usual retransmit timer handle it */
404 }
405 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
406 
do_redirect(struct sk_buff * skb,struct sock * sk)407 static void do_redirect(struct sk_buff *skb, struct sock *sk)
408 {
409 	struct dst_entry *dst = __sk_dst_check(sk, 0);
410 
411 	if (dst)
412 		dst->ops->redirect(dst, sk, skb);
413 }
414 
415 
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
 * @sk:    the request socket (caller's reference is consumed here)
 * @seq:   sequence number carried in the ICMP payload's TCP header
 * @abort: true when the ICMP type/code warrants dropping the request
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		/* Sequence mismatch: bogus or stale ICMP, count and ignore. */
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	/* Drop the reference taken by the lookup in the caller. */
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);
440 
/* TCP-LD (RFC 6069) logic: on receipt of an ICMP unreachable that suggests
 * a transient path failure, undo one step of RTO exponential backoff and
 * re-arm (or immediately fire) the retransmit timer.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	/* Only revert if the ICMP refers to the oldest unacked data and we
	 * actually are in backed-off retransmission.
	 */
	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one backoff step and recompute the RTO from srtt. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
479 
480 /*
481  * This routine is called by the ICMP module when it gets some
482  * sort of error condition.  If err < 0 then the socket should
483  * be closed and the error returned to the user.  If err > 0
484  * it's just the icmp type << 8 | icmp code.  After adjustment
485  * header points to the first 8 bytes of the tcp header.  We need
486  * to find the appropriate port.
487  *
488  * The locking strategy used here is very "optimistic". When
489  * someone else accesses the socket the ICMP is just dropped
490  * and for some paths there is no check at all.
491  * A more general error queue to queue errors for later handling
492  * is probably better.
493  *
494  */
495 
tcp_v4_err(struct sk_buff * skb,u32 info)496 int tcp_v4_err(struct sk_buff *skb, u32 info)
497 {
498 	const struct iphdr *iph = (const struct iphdr *)skb->data;
499 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
500 	struct net *net = dev_net_rcu(skb->dev);
501 	const int type = icmp_hdr(skb)->type;
502 	const int code = icmp_hdr(skb)->code;
503 	struct request_sock *fastopen;
504 	struct tcp_sock *tp;
505 	u32 seq, snd_una;
506 	struct sock *sk;
507 	int err;
508 
509 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
510 				       ntohs(th->source), inet_iif(skb), 0);
511 	if (!sk) {
512 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
513 		return -ENOENT;
514 	}
515 	if (sk->sk_state == TCP_TIME_WAIT) {
516 		/* To increase the counter of ignored icmps for TCP-AO */
517 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
518 		inet_twsk_put(inet_twsk(sk));
519 		return 0;
520 	}
521 	seq = ntohl(th->seq);
522 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
523 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
524 				     type == ICMP_TIME_EXCEEDED ||
525 				     (type == ICMP_DEST_UNREACH &&
526 				      (code == ICMP_NET_UNREACH ||
527 				       code == ICMP_HOST_UNREACH)));
528 		return 0;
529 	}
530 
531 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
532 		sock_put(sk);
533 		return 0;
534 	}
535 
536 	bh_lock_sock(sk);
537 	/* If too many ICMPs get dropped on busy
538 	 * servers this needs to be solved differently.
539 	 * We do take care of PMTU discovery (RFC1191) special case :
540 	 * we can receive locally generated ICMP messages while socket is held.
541 	 */
542 	if (sock_owned_by_user(sk)) {
543 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
544 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
545 	}
546 	if (sk->sk_state == TCP_CLOSE)
547 		goto out;
548 
549 	if (static_branch_unlikely(&ip4_min_ttl)) {
550 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
551 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
552 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
553 			goto out;
554 		}
555 	}
556 
557 	tp = tcp_sk(sk);
558 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
559 	fastopen = rcu_dereference(tp->fastopen_rsk);
560 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
561 	if (sk->sk_state != TCP_LISTEN &&
562 	    !between(seq, snd_una, tp->snd_nxt)) {
563 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
564 		goto out;
565 	}
566 
567 	switch (type) {
568 	case ICMP_REDIRECT:
569 		if (!sock_owned_by_user(sk))
570 			do_redirect(skb, sk);
571 		goto out;
572 	case ICMP_SOURCE_QUENCH:
573 		/* Just silently ignore these. */
574 		goto out;
575 	case ICMP_PARAMETERPROB:
576 		err = EPROTO;
577 		break;
578 	case ICMP_DEST_UNREACH:
579 		if (code > NR_ICMP_UNREACH)
580 			goto out;
581 
582 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
583 			/* We are not interested in TCP_LISTEN and open_requests
584 			 * (SYN-ACKs send out by Linux are always <576bytes so
585 			 * they should go through unfragmented).
586 			 */
587 			if (sk->sk_state == TCP_LISTEN)
588 				goto out;
589 
590 			WRITE_ONCE(tp->mtu_info, info);
591 			if (!sock_owned_by_user(sk)) {
592 				tcp_v4_mtu_reduced(sk);
593 			} else {
594 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
595 					sock_hold(sk);
596 			}
597 			goto out;
598 		}
599 
600 		err = icmp_err_convert[code].errno;
601 		/* check if this ICMP message allows revert of backoff.
602 		 * (see RFC 6069)
603 		 */
604 		if (!fastopen &&
605 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
606 			tcp_ld_RTO_revert(sk, seq);
607 		break;
608 	case ICMP_TIME_EXCEEDED:
609 		err = EHOSTUNREACH;
610 		break;
611 	default:
612 		goto out;
613 	}
614 
615 	switch (sk->sk_state) {
616 	case TCP_SYN_SENT:
617 	case TCP_SYN_RECV:
618 		/* Only in fast or simultaneous open. If a fast open socket is
619 		 * already accepted it is treated as a connected one below.
620 		 */
621 		if (fastopen && !fastopen->sk)
622 			break;
623 
624 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
625 
626 		if (!sock_owned_by_user(sk))
627 			tcp_done_with_error(sk, err);
628 		else
629 			WRITE_ONCE(sk->sk_err_soft, err);
630 		goto out;
631 	}
632 
633 	/* If we've already connected we will keep trying
634 	 * until we time out, or the user gives up.
635 	 *
636 	 * rfc1122 4.2.3.9 allows to consider as hard errors
637 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
638 	 * but it is obsoleted by pmtu discovery).
639 	 *
640 	 * Note, that in modern internet, where routing is unreliable
641 	 * and in each dark corner broken firewalls sit, sending random
642 	 * errors ordered by their masters even this two messages finally lose
643 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
644 	 *
645 	 * Now we are in compliance with RFCs.
646 	 *							--ANK (980905)
647 	 */
648 
649 	if (!sock_owned_by_user(sk) &&
650 	    inet_test_bit(RECVERR, sk)) {
651 		WRITE_ONCE(sk->sk_err, err);
652 		sk_error_report(sk);
653 	} else	{ /* Only an error on timeout */
654 		WRITE_ONCE(sk->sk_err_soft, err);
655 	}
656 
657 out:
658 	bh_unlock_sock(sk);
659 	sock_put(sk);
660 	return 0;
661 }
662 
__tcp_v4_send_check(struct sk_buff * skb,__be32 saddr,__be32 daddr)663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
664 {
665 	struct tcphdr *th = tcp_hdr(skb);
666 
667 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
668 	skb->csum_start = skb_transport_header(skb) - skb->head;
669 	skb->csum_offset = offsetof(struct tcphdr, check);
670 }
671 
672 /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
674 {
675 	const struct inet_sock *inet = inet_sk(sk);
676 
677 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
678 }
679 EXPORT_IPV6_MOD(tcp_v4_send_check);
680 
681 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
682 
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
684 				 const struct tcp_ao_hdr *aoh,
685 				 struct ip_reply_arg *arg, struct tcphdr *reply,
686 				 __be32 reply_options[REPLY_OPTIONS_LEN])
687 {
688 #ifdef CONFIG_TCP_AO
689 	int sdif = tcp_v4_sdif(skb);
690 	int dif = inet_iif(skb);
691 	int l3index = sdif ? dif : 0;
692 	bool allocated_traffic_key;
693 	struct tcp_ao_key *key;
694 	char *traffic_key;
695 	bool drop = true;
696 	u32 ao_sne = 0;
697 	u8 keyid;
698 
699 	rcu_read_lock();
700 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
701 				 &key, &traffic_key, &allocated_traffic_key,
702 				 &keyid, &ao_sne))
703 		goto out;
704 
705 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
706 				 (aoh->rnext_keyid << 8) | keyid);
707 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
708 	reply->doff = arg->iov[0].iov_len / 4;
709 
710 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
711 			    key, traffic_key,
712 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
714 			    reply, ao_sne))
715 		goto out;
716 	drop = false;
717 out:
718 	rcu_read_unlock();
719 	if (allocated_traffic_key)
720 		kfree(traffic_key);
721 	return drop;
722 #else
723 	return true;
724 #endif
725 }
726 
727 /*
728  *	This routine will send an RST to the other tcp.
729  *
730  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
731  *		      for reset.
732  *	Answer: if a packet caused RST, it is not for a socket
733  *		existing in our system, if it is matched to a socket,
734  *		it is just duplicate segment or bug in other side's TCP.
735  *		So that we build reply only basing on parameters
736  *		arrived with segment.
737  *	Exception: precedence violation. We do not implement it in any case.
738  */
739 
tcp_v4_send_reset(const struct sock * sk,struct sk_buff * skb,enum sk_rst_reason reason)740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
741 			      enum sk_rst_reason reason)
742 {
743 	const struct tcphdr *th = tcp_hdr(skb);
744 	struct {
745 		struct tcphdr th;
746 		__be32 opt[REPLY_OPTIONS_LEN];
747 	} rep;
748 	const __u8 *md5_hash_location = NULL;
749 	const struct tcp_ao_hdr *aoh;
750 	struct ip_reply_arg arg;
751 #ifdef CONFIG_TCP_MD5SIG
752 	struct tcp_md5sig_key *key = NULL;
753 	unsigned char newhash[16];
754 	struct sock *sk1 = NULL;
755 #endif
756 	u64 transmit_time = 0;
757 	struct sock *ctl_sk;
758 	struct net *net;
759 	u32 txhash = 0;
760 
761 	/* Never send a reset in response to a reset. */
762 	if (th->rst)
763 		return;
764 
765 	/* If sk not NULL, it means we did a successful lookup and incoming
766 	 * route had to be correct. prequeue might have dropped our dst.
767 	 */
768 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
769 		return;
770 
771 	/* Swap the send and the receive. */
772 	memset(&rep, 0, sizeof(rep));
773 	rep.th.dest   = th->source;
774 	rep.th.source = th->dest;
775 	rep.th.doff   = sizeof(struct tcphdr) / 4;
776 	rep.th.rst    = 1;
777 
778 	if (th->ack) {
779 		rep.th.seq = th->ack_seq;
780 	} else {
781 		rep.th.ack = 1;
782 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
783 				       skb->len - (th->doff << 2));
784 	}
785 
786 	memset(&arg, 0, sizeof(arg));
787 	arg.iov[0].iov_base = (unsigned char *)&rep;
788 	arg.iov[0].iov_len  = sizeof(rep.th);
789 
790 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
791 
792 	/* Invalid TCP option size or twice included auth */
793 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
794 		return;
795 
796 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
797 		return;
798 
799 #ifdef CONFIG_TCP_MD5SIG
800 	rcu_read_lock();
801 	if (sk && sk_fullsock(sk)) {
802 		const union tcp_md5_addr *addr;
803 		int l3index;
804 
805 		/* sdif set, means packet ingressed via a device
806 		 * in an L3 domain and inet_iif is set to it.
807 		 */
808 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
809 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
810 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
811 	} else if (md5_hash_location) {
812 		const union tcp_md5_addr *addr;
813 		int sdif = tcp_v4_sdif(skb);
814 		int dif = inet_iif(skb);
815 		int l3index;
816 
817 		/*
818 		 * active side is lost. Try to find listening socket through
819 		 * source port, and then find md5 key through listening socket.
820 		 * we are not loose security here:
821 		 * Incoming packet is checked with md5 hash with finding key,
822 		 * no RST generated if md5 hash doesn't match.
823 		 */
824 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
825 					     th->source, ip_hdr(skb)->daddr,
826 					     ntohs(th->source), dif, sdif);
827 		/* don't send rst if it can't find key */
828 		if (!sk1)
829 			goto out;
830 
831 		/* sdif set, means packet ingressed via a device
832 		 * in an L3 domain and dif is set to it.
833 		 */
834 		l3index = sdif ? dif : 0;
835 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
836 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
837 		if (!key)
838 			goto out;
839 
840 		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
841 		if (crypto_memneq(md5_hash_location, newhash, 16))
842 			goto out;
843 	}
844 
845 	if (key) {
846 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
847 				   (TCPOPT_NOP << 16) |
848 				   (TCPOPT_MD5SIG << 8) |
849 				   TCPOLEN_MD5SIG);
850 		/* Update length and the length the header thinks exists */
851 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
852 		rep.th.doff = arg.iov[0].iov_len / 4;
853 
854 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
855 				     key, ip_hdr(skb)->saddr,
856 				     ip_hdr(skb)->daddr, &rep.th);
857 	}
858 #endif
859 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
860 	if (rep.opt[0] == 0) {
861 		__be32 mrst = mptcp_reset_option(skb);
862 
863 		if (mrst) {
864 			rep.opt[0] = mrst;
865 			arg.iov[0].iov_len += sizeof(mrst);
866 			rep.th.doff = arg.iov[0].iov_len / 4;
867 		}
868 	}
869 
870 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
871 				      ip_hdr(skb)->saddr, /* XXX */
872 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
873 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
874 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
875 
876 	/* When socket is gone, all binding information is lost.
877 	 * routing might fail in this case. No choice here, if we choose to force
878 	 * input interface, we will misroute in case of asymmetric route.
879 	 */
880 	if (sk)
881 		arg.bound_dev_if = sk->sk_bound_dev_if;
882 
883 	trace_tcp_send_reset(sk, skb, reason);
884 
885 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
886 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
887 
888 	/* ECN bits of TW reset are cleared */
889 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
890 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
891 	local_bh_disable();
892 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
893 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
894 
895 	sock_net_set(ctl_sk, net);
896 	if (sk) {
897 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
898 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
899 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
900 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
901 		transmit_time = tcp_transmit_time(sk);
902 		xfrm_sk_clone_policy(ctl_sk, sk);
903 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
904 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
905 	} else {
906 		ctl_sk->sk_mark = 0;
907 		ctl_sk->sk_priority = 0;
908 	}
909 	ip_send_unicast_reply(ctl_sk, sk,
910 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
911 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
912 			      &arg, arg.iov[0].iov_len,
913 			      transmit_time, txhash);
914 
915 	xfrm_sk_free_policy(ctl_sk);
916 	sock_net_set(ctl_sk, &init_net);
917 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
918 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
919 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
920 	local_bh_enable();
921 
922 #ifdef CONFIG_TCP_MD5SIG
923 out:
924 	rcu_read_unlock();
925 #endif
926 }
927 
928 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
929    outside socket context is ugly, certainly. What can I do?
930  */
931 
/* Build and transmit a bare ACK without a full socket context.
 * @sk here is a timewait or request socket, never a full established
 * socket owned by this path; the packet is sent through this CPU's
 * pre-allocated control socket (ipv4_tcp_sk).
 * Used for the SYN-RECV and TIME-WAIT replies below.
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply header plus room for every TCP option we may append. */
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	/* Timestamp option goes first; the MD5/AO blocks below rely on it
	 * occupying opt[0..2] (see their "offset = tsecr ? 3 : 0").
	 */
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		/* Sign the header only: this reply carries no payload. */
		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	/* Seed the pseudo-header sum; csumoffset tells the IP layer where
	 * to fold the final TCP checksum into the header.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* Borrow the per-cpu control socket.  BH must stay disabled and the
	 * nested-BH local lock held for as long as we own it, since its
	 * netns/mark/priority fields are rewritten for each caller.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Park the control socket back in init_net before releasing it. */
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
1034 
/* Send an ACK on behalf of a TIME-WAIT socket, signing it with TCP-AO or
 * TCP-MD5 if the timewait state carries a matching key.  Consumes the
 * timewait reference (inet_twsk_put()) on every path.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clear the ECN bits only for TW ACKs answering out-of-window data
	 * or PAWS-rejected segments; other TW ACKs keep their ECN bits so
	 * they are not steered into a different service queue (Classic
	 * rather than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: drop without replying. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* NOTE: this `if` pairs with the `} else if` below even across the
	 * #else branch — when CONFIG_TCP_AO is off, the dead `if (0)` keeps
	 * the else-chain syntactically intact.
	 */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1098 
/* Send an ACK for a not-yet-accepted connection (SYN-RECV), signing it with
 * TCP-AO or TCP-MD5 when a matching key exists.  Silently returns without
 * ACKing when auth options are malformed or the AO key vanished, letting the
 * handshake time out instead.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Traffic key is derived per-request; freed below after use. */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* Keep the else-chain below intact when CONFIG_TCP_AO is off. */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* ECN bits are always cleared for these out-of-context ACKs. */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
1176 
1177 /*
1178  *	Send a SYN-ACK after having received a SYN.
1179  *	This still operates on a request_sock only, not on a big
1180  *	socket.
1181  */
/* Build and transmit a SYN-ACK for @req.  @dst may be preallocated by the
 * caller (syncookie path); otherwise a route is looked up here.  Returns 0
 * on success or a negative error (also -1 when routing/skb allocation
 * fails).
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		/* Record which ECT codepoint we sent on the SYN-ACK. */
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* Optionally reflect the peer's DSCP, keeping our own ECN
		 * bits (see sysctl_tcp_reflect_tos).
		 */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		/* RCU protects ireq_opt against concurrent release. */
		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}
1226 
1227 /*
1228  *	IPv4 request_sock destructor.
1229  */
1230 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1231 {
1232 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1233 }
1234 
1235 #ifdef CONFIG_TCP_MD5SIG
1236 /*
1237  * RFC2385 MD5 checksumming requires a mapping of
1238  * IP address->MD5 Key.
1239  * We need to maintain these in the sk structure.
1240  */
1241 
1242 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1243 EXPORT_IPV6_MOD(tcp_md5_needed);
1244 
1245 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1246 {
1247 	if (!old)
1248 		return true;
1249 
1250 	/* l3index always overrides non-l3index */
1251 	if (old->l3index && new->l3index == 0)
1252 		return false;
1253 	if (old->l3index == 0 && new->l3index)
1254 		return true;
1255 
1256 	return old->prefixlen < new->prefixlen;
1257 }
1258 
1259 /* Find the Key structure for an address.  */
/* Find the best-matching MD5 key for @addr on socket @sk.
 * Keys are prefix-based, so several may match; better_md5_match() picks
 * the most specific one (L3-bound keys beat unbound, longer prefix wins).
 * @any_l3index: ignore the key's L3 binding entirely (match any device).
 * Returns NULL when no key matches or MD5 is not configured on @sk.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Skip keys bound to a different L3 domain, unless the
		 * caller asked to ignore L3 bindings.
		 */
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1303 
/* Find the MD5 key that exactly matches (@addr, @family, @prefixlen,
 * @l3index, IFINDEX flag) — no best-match semantics, used by add/delete
 * so that distinct prefix/scope entries never collide.
 * Returns NULL if no identical entry exists.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Only the IFINDEX bit participates in the comparison. */
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1337 
1338 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1339 					 const struct sock *addr_sk)
1340 {
1341 	const union tcp_md5_addr *addr;
1342 	int l3index;
1343 
1344 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1345 						 addr_sk->sk_bound_dev_if);
1346 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1347 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1348 }
1349 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1350 
/* Allocate and publish the per-socket MD5 key container.
 * Also disables GSO on @sk, since segments must be signed individually.
 * Returns 0 on success or -ENOMEM.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc_obj(*md5sig, gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	/* rcu_assign_pointer() publishes the fully initialized struct. */
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}
1365 
1366 /* This can be called on a newly created socket, from other files */
/* This can be called on a newly created socket, from other files */
/* Insert or update an MD5 key on @sk.  An existing exact match is updated
 * in place (tolerating concurrent readers); otherwise a new zeroed entry
 * is allocated, filled, and published via RCU.  The md5sig_info container
 * must already exist.  Returns 0 or -ENOMEM.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	/* __GFP_ZERO: see keylen/key[] ordering comment above. */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	/* Publish only after the entry is fully initialized. */
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
1415 
/* Add an MD5 key from process context (GFP_KERNEL).  On the first key of
 * a socket this also allocates the container and takes a tcp_md5_needed
 * static-key reference; if that reference cannot be taken (refcount
 * saturated), the freshly added container is rolled back and -EUSERS is
 * returned.  Refuses to operate in FIPS mode.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		/* Roll back the container if the static key is exhausted. */
		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);
1445 
/* Copy an existing MD5 key onto @sk (e.g. listener -> child socket).
 * Atomic-context variant of tcp_md5_do_add(): uses GFP_ATOMIC and a
 * non-sleeping static-key increment, with the same rollback on -EUSERS.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		/* Non-sleeping increment; roll back the container on failure. */
		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);
1473 
/* Delete the MD5 key exactly matching the given tuple.  The entry is
 * unlinked RCU-safely and freed after a grace period; the socket's
 * optmem accounting is released immediately.  Returns 0 or -ENOENT.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);
1488 
/* Free every MD5 key on @sk.  Plain hlist_del()/kfree() (no RCU grace
 * period): the caller guarantees no concurrent readers — note the
 * unconditional (condition 1) protected dereference.
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}
1504 
/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler for IPv4 sockets.
 * Validates the user-supplied struct tcp_md5sig and adds or (when
 * tcpm_keylen == 0) deletes the corresponding key.  Prefix and ifindex
 * extensions are honoured only for TCP_MD5SIG_EXT.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* Zero key length means "delete this key". */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
1570 
/* Feed the IPv4 pseudo-header and the TCP header (with checksum zeroed,
 * per RFC 2385) into the running MD5 context.  @nbytes is the TCP segment
 * length that goes into the pseudo-header.
 */
static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
				    __be32 daddr, __be32 saddr,
				    const struct tcphdr *th, int nbytes)
{
	struct {
		struct tcp4_pseudohdr ip;
		struct tcphdr tcp;
	} h;

	h.ip.saddr = saddr;
	h.ip.daddr = daddr;
	h.ip.pad = 0;
	h.ip.protocol = IPPROTO_TCP;
	h.ip.len = cpu_to_be16(nbytes);
	h.tcp = *th;
	/* The signature is computed as if the checksum field were zero. */
	h.tcp.check = 0;
	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}
1589 
/* Compute the RFC 2385 MD5 signature over pseudo-header + TCP header +
 * key only (no payload) into @md5_hash.  Used for the payload-free replies
 * built by tcp_v4_send_ack()/send_reset().
 */
static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	/* th->doff << 2 == header length in bytes (no payload follows). */
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
1601 
/* Compute the RFC 2385 MD5 signature over pseudo-header + TCP header +
 * payload + key for @skb into @md5_hash.  Addresses come from @sk when
 * available (established/request sockets); otherwise from the IP header.
 */
noinline_for_stack void
tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
		    const struct sock *sk, const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;
	struct md5_ctx ctx;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	md5_init(&ctx);
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
	/* Payload starts right after the TCP header (th->doff << 2 bytes). */
	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1626 
1627 #endif
1628 
/* Initialize the IPv4 parts of a fresh request sock from the incoming SYN:
 * local/remote addresses (swapped relative to the packet) and a copy of
 * the SYN's IP options for later use on replies.
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}
1640 
/* route_req callback: initialize the request sock, run the LSM hook, and
 * look up a route for the SYN-ACK.  Returns NULL if security denies the
 * connection or routing fails.  @tw_isn is unused on IPv4.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1654 
/* Family-level request_sock operations for IPv4 TCP. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};
1662 
/* TCP-specific request_sock operations for IPv4: MD5/AO signing hooks,
 * syncookie sequence generation, routing and SYN-ACK transmission.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq_and_ts_off	=	tcp_v4_init_seq_and_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1681 
1682 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1683 {
1684 	/* Never answer to SYNs send to broadcast or multicast */
1685 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1686 		goto drop;
1687 
1688 	return tcp_conn_request(&tcp_request_sock_ops,
1689 				&tcp_request_sock_ipv4_ops, sk, skb);
1690 
1691 drop:
1692 	tcp_listendrop(sk);
1693 	return 0;
1694 }
1695 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1696 
1697 
1698 /*
1699  * The three way handshake has completed - we got a valid synack -
1700  * now create the new socket.
1701  */
/* Create the child socket once the three-way handshake completed.
 * On success returns the new socket and sets *own_req; returns NULL on
 * accept-queue overflow, allocation failure, routing failure, or when a
 * duplicate was inserted concurrently (syncookie race).
 * @opt_child_init: optional IPv6 mapped-socket fixup hook.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* AO takes precedence: skip MD5 copy if the request used AO. */
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* Child now owns the IP options; stop req from freeing them. */
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1822 
/* On a listener, a non-SYN segment may be the ACK completing a syncookie
 * handshake: let cookie_v4_check() validate it and possibly hand back a
 * child socket.  Without CONFIG_SYN_COOKIES this is a no-op passthrough.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1833 
/* Generate a syncookie ISN for BPF/XDP-driven SYN handling.
 * Returns the clamped MSS (0 when syncookies are unavailable or the SYN
 * is unsuitable) and writes the cookie sequence number to *cookie.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		/* Record overflow so later ACKs are checked as cookies. */
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1848 
1849 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1850 							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
/* Per-socket IPv4 TCP receive.  Dispatches @skb by socket state:
 * established fast path, listener (including syncookie child creation),
 * or the generic state machine.  Sends a RST when processing says so.
 * Always returns 0; the skb is consumed on every path.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		/* Drop the cached rx route if it went stale or the packet
		 * arrived on a different device.
		 */
		if (dst) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		/* nsk != sk: a syncookie ACK produced a child socket. */
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	/* Fall through from csum_err: both paths count an input error. */
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1934 
/* Early demultiplex: before routing, try to match the packet to an
 * established socket and attach the socket (and its cached rx dst, if
 * still valid) to the skb, skipping a second lookup later.
 * Always returns 0 — purely an optimization, never a verdict.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Reject bogus data offsets shorter than the basic header. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			/* Only reuse the dst if it is valid and was cached
			 * for this ingress device.
			 */
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1972 
/* Queue @skb on the backlog of @sk while the socket is owned by another
 * context.  Called with the socket spinlock held (bh_lock_sock).
 * First tries to coalesce @skb into the current backlog tail to reduce
 * memory usage and per-packet processing cost.
 * Returns true if the skb was dropped -- in that case the socket lock has
 * already been released and *reason is set -- false if it was queued or
 * coalesced.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	/* Verify the checksum now: a corrupted packet must not reach the
	 * backlog at all.
	 */
	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing requirements: contiguous in-order sequence space, same
	 * DSCP/ECN byte, no SYN/RST/URG on either segment, ACK set on both,
	 * matching ECE/CWR/AE flags, identical data offset and TCP options,
	 * and skbs that are otherwise compatible.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	/* Capture GSO parameters of both skbs before merging. */
	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Propagate the newer ACK/window, but never go backwards. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		/* Carry the rx timestamp of the freshest segment. */
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Coalesce failed: restore the header before queueing as-is. */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
2114 EXPORT_IPV6_MOD(tcp_add_backlog);
2115 
2116 static void tcp_v4_restore_cb(struct sk_buff *skb)
2117 {
2118 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2119 		sizeof(struct inet_skb_parm));
2120 }
2121 
/* Populate TCP_SKB_CB() from the IP and TCP headers.
 * The IPv4 control block is first relocated into TCP_SKB_CB()->header.h4
 * because both control blocks share the same skb->cb[] storage;
 * tcp_v4_restore_cb() performs the inverse move.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one unit of sequence space. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2142 
2143 /*
2144  *	From tcp_input.c
2145  */
2146 
/* Main IPv4 TCP receive entry point, called from the IP layer for every
 * TCP segment addressed to this host.  Validates the header and checksum,
 * looks up the owning socket, and dispatches according to socket state:
 * NEW_SYN_RECV request sockets are completed via tcp_check_req(),
 * TIME_WAIT sockets go through tcp_timewait_state_process(), and full
 * sockets are processed directly or queued to the backlog if owned by
 * user context.  Returns 0 in all cases (the skb is always consumed).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* pskb_may_pull() may have reallocated the header: refresh pointers. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	/* Request socket: this segment may complete the 3WHS. */
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_skbadd(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		/* Listener went away (e.g. close()): try to migrate the req
		 * to another listener in the same reuseport group.
		 */
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb, &drop_reason)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
					    &drop_reason);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* Still a pure request (e.g. retransmitted SYN):
			 * let the listener path handle it below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				goto discard_and_relse;
			}
			sock_put(sk);
			return 0;
		}
	}

process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	/* tcp_filter() (BPF) may trim the skb: refresh header pointers. */
	if (tcp_filter(sk, skb, &drop_reason))
		goto discard_and_relse;

	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	/* Process directly if the socket is not owned by user context,
	 * otherwise defer to the backlog.
	 */
	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* No socket but a valid segment: answer with a RST. */
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_skbadd(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}

	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
					       &drop_reason);
	switch (tw_status) {
	case TCP_TW_SYN: {
		/* Acceptable new SYN: find a listener and reprocess the
		 * segment as a fresh connection attempt.
		 */
		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}

		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
		if (drop_reason)
			break;
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
	case TCP_TW_ACK_OOW:
		tcp_v4_timewait_ack(sk, skb, tw_status);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2412 
/* IPv4 has no extra per-timewait state: only the object size is needed. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
};
2416 
2417 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2418 {
2419 	struct dst_entry *dst = skb_dst(skb);
2420 
2421 	if (dst && dst_hold_safe(dst)) {
2422 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2423 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2424 	}
2425 }
2426 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2427 
/* Address-family specific operations used by TCP over IPv4 (also shared
 * with IPv6 sockets handling IPv4-mapped traffic).
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
2440 EXPORT_IPV6_MOD(ipv4_specific);
2441 
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 hooks for TCP-MD5 (RFC 2385) and TCP-AO (RFC 5925) key handling
 * and signature computation.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup		= tcp_v4_ao_lookup,
	.calc_ao_hash		= tcp_v4_ao_hash_skb,
	.ao_parse		= tcp_v4_parse_ao,
	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
#endif
};

/* sk_destruct: release MD5/AO key material before the generic inet
 * socket destructor runs.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
#endif
2464 
2465 /* NOTE: A lot of things set to zero explicitly by call to
2466  *       sk_alloc() so need not be done here.
2467  */
2468 static int tcp_v4_init_sock(struct sock *sk)
2469 {
2470 	struct inet_connection_sock *icsk = inet_csk(sk);
2471 
2472 	tcp_init_sock(sk);
2473 
2474 	icsk->icsk_af_ops = &ipv4_specific;
2475 
2476 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2477 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2478 	sk->sk_destruct = tcp4_destruct_sock;
2479 #endif
2480 
2481 	return 0;
2482 }
2483 
/* Drop the page-pool references still held in sk->sk_user_frags
 * (devmem/zero-copy receive).  No-op without CONFIG_PAGE_POOL.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}
2494 
/* Final TCP-level teardown of a socket: stop timers, release congestion
 * control / ULP state, purge all queues, drop the bind bucket and free
 * fastopen state.  Shared with IPv6 (tcp_v6_destroy_sock calls it).
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen child must have been unlinked before destruction. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
2533 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2534 
2535 #ifdef CONFIG_PROC_FS
2536 /* Proc filesystem TCP sock list dumping. */
2537 
2538 static unsigned short seq_file_family(const struct seq_file *seq);
2539 
2540 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2541 {
2542 	unsigned short family = seq_file_family(seq);
2543 
2544 	/* AF_UNSPEC is used as a match all */
2545 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2546 		net_eq(sock_net(sk), seq_file_net(seq)));
2547 }
2548 
2549 /* Find a non empty bucket (starting from st->bucket)
2550  * and return the first sk from it.
2551  */
/* Scan listener buckets starting at st->bucket and return the first
 * matching socket found.
 * On success the bucket's ilb2->lock is left HELD (released later by
 * listening_get_next()/tcp_seq_stop()); on NULL no lock is held.
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* Lockless skip of empty buckets. */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2577 
2578 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2579  * If "cur" is the last one in the st->bucket,
2580  * call listening_get_first() to return the first sk of the next
2581  * non empty bucket.
2582  */
/* Advance to the next matching listener after @cur within the current
 * bucket; when the bucket is exhausted, release its lock and continue
 * with the next non-empty bucket via listening_get_first().
 * Caller holds the current bucket's ilb2->lock (taken by
 * listening_get_first()).
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* End of bucket: drop its lock before moving on. */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
2606 
2607 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2608 {
2609 	struct tcp_iter_state *st = seq->private;
2610 	void *rc;
2611 
2612 	st->bucket = 0;
2613 	st->offset = 0;
2614 	rc = listening_get_first(seq);
2615 
2616 	while (rc && *pos) {
2617 		rc = listening_get_next(seq, rc);
2618 		--*pos;
2619 	}
2620 	return rc;
2621 }
2622 
/* Lockless test: is the ehash bucket selected by st->bucket empty?
 * Used to avoid taking the bucket spinlock in the common case.
 */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
2628 
2629 /*
2630  * Get first established socket starting from bucket given in st->bucket.
2631  * If st->bucket is zero, the very first socket in the hash is returned.
2632  */
/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 * On success the bucket's ehash lock is left HELD (with BH disabled);
 * it is released by established_get_next()/tcp_seq_stop().
 */
static void *established_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

		/* The hash can be huge: yield between buckets. */
		cond_resched();

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(hinfo, st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
			if (seq_sk_match(seq, sk))
				return sk;
		}
		spin_unlock_bh(lock);
	}

	return NULL;
}
2660 
/* Advance to the next matching established socket after @cur in the
 * current ehash bucket; when exhausted, release the bucket lock and move
 * to the next non-empty bucket via established_get_first().
 * Caller holds the current bucket's ehash lock (BH disabled).
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;
	}

	/* End of bucket: drop its lock before moving on. */
	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2682 
2683 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2684 {
2685 	struct tcp_iter_state *st = seq->private;
2686 	void *rc;
2687 
2688 	st->bucket = 0;
2689 	rc = established_get_first(seq);
2690 
2691 	while (rc && pos) {
2692 		rc = established_get_next(seq, rc);
2693 		--pos;
2694 	}
2695 	return rc;
2696 }
2697 
2698 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2699 {
2700 	void *rc;
2701 	struct tcp_iter_state *st = seq->private;
2702 
2703 	st->state = TCP_SEQ_STATE_LISTENING;
2704 	rc	  = listening_get_idx(seq, &pos);
2705 
2706 	if (!rc) {
2707 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2708 		rc	  = established_get_idx(seq, pos);
2709 	}
2710 
2711 	return rc;
2712 }
2713 
/* Fast-path resume: try to reposition the iterator at the bucket/offset
 * remembered from the previous read without rescanning from the start.
 * Falls back (returns NULL) if the remembered bucket drained or shrank,
 * in which case the caller restarts from scratch.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Re-walk to the remembered offset, but only while we stay
		 * in the same bucket: getters may have advanced st->bucket.
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	/* The getters above bumped st->num; restore the true count. */
	st->num = orig_num;

	return rc;
}
2747 
2748 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2749 {
2750 	struct tcp_iter_state *st = seq->private;
2751 	void *rc;
2752 
2753 	if (*pos && *pos == st->last_pos) {
2754 		rc = tcp_seek_last_pos(seq);
2755 		if (rc)
2756 			goto out;
2757 	}
2758 
2759 	st->state = TCP_SEQ_STATE_LISTENING;
2760 	st->num = 0;
2761 	st->bucket = 0;
2762 	st->offset = 0;
2763 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2764 
2765 out:
2766 	st->last_pos = *pos;
2767 	return rc;
2768 }
2769 EXPORT_IPV6_MOD(tcp_seq_start);
2770 
/* seq_file ->next(): advance the iterator past @v, switching from the
 * listening table to the established table when the former is exhausted.
 * After SEQ_START_TOKEN (the header) it returns the very first socket.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			/* Listeners exhausted: continue with ehash table. */
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}
2800 EXPORT_IPV6_MOD(tcp_seq_next);
2801 
/* seq_file ->stop(): release the bucket lock still held by the getters
 * when iteration stops mid-bucket (@v non-NULL / not the header token).
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
2818 EXPORT_IPV6_MOD(tcp_seq_stop);
2819 
/* Emit one /proc/net/tcp line for a pending request socket (SYN_RECV).
 * Field layout must match get_tcp4_sock()/get_timewait4_sock().
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	/* Time until the next SYNACK retransmit fires. */
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2845 
/* Emit one /proc/net/tcp line for a full socket.  Reads socket state
 * locklessly (READ_ONCE/smp_load_acquire), so transient inconsistencies
 * are possible and tolerated.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Classify the pending timer into the historic /proc codes:
	 * 1 = retransmit-like, 4 = zero-window probe, 2 = keepalive, 0 = none.
	 */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active	= 2;
		timer_expires	= icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2909 
2910 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2911 			       struct seq_file *f, int i)
2912 {
2913 	long delta = tw->tw_timer.expires - jiffies;
2914 	__be32 dest, src;
2915 	__u16 destp, srcp;
2916 
2917 	dest  = tw->tw_daddr;
2918 	src   = tw->tw_rcv_saddr;
2919 	destp = ntohs(tw->tw_dport);
2920 	srcp  = ntohs(tw->tw_sport);
2921 
2922 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2923 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2924 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2925 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2926 		refcount_read(&tw->tw_refcnt), tw);
2927 }
2928 
2929 #define TMPSZ 150
2930 
2931 static int tcp4_seq_show(struct seq_file *seq, void *v)
2932 {
2933 	struct tcp_iter_state *st;
2934 	struct sock *sk = v;
2935 
2936 	seq_setwidth(seq, TMPSZ - 1);
2937 	if (v == SEQ_START_TOKEN) {
2938 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2939 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2940 			   "inode");
2941 		goto out;
2942 	}
2943 	st = seq->private;
2944 
2945 	if (sk->sk_state == TCP_TIME_WAIT)
2946 		get_timewait4_sock(v, seq, st->num);
2947 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2948 		get_openreq4(v, seq, st->num);
2949 	else
2950 		get_tcp4_sock(v, seq, st->num);
2951 out:
2952 	seq_pad(seq, '\n');
2953 	return 0;
2954 }
2955 
2956 #ifdef CONFIG_BPF_SYSCALL
/* One slot of the BPF iterator batch: holds a socket reference while the
 * batch is live, or (after bpf_iter_tcp_put_batch()) the socket's cookie
 * so iteration can resume at the same position later.
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};

/* State of the BPF TCP iterator: the regular /proc iterator state plus a
 * resizable batch of sockets grabbed from one bucket at a time.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;	/* next batch index to hand to the prog */
	unsigned int end_sk;	/* number of valid entries in batch[] */
	unsigned int max_sk;	/* allocated capacity of batch[] */
	union bpf_tcp_iter_batch_item *batch;
};

/* Context passed to the BPF program for each visited socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2975 
2976 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2977 			     struct sock_common *sk_common, uid_t uid)
2978 {
2979 	struct bpf_iter__tcp ctx;
2980 
2981 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2982 	ctx.meta = meta;
2983 	ctx.sk_common = sk_common;
2984 	ctx.uid = uid;
2985 	return bpf_iter_run_prog(prog, &ctx);
2986 }
2987 
2988 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2989 {
2990 	union bpf_tcp_iter_batch_item *item;
2991 	unsigned int cur_sk = iter->cur_sk;
2992 	__u64 cookie;
2993 
2994 	/* Remember the cookies of the sockets we haven't seen yet, so we can
2995 	 * pick up where we left off next time around.
2996 	 */
2997 	while (cur_sk < iter->end_sk) {
2998 		item = &iter->batch[cur_sk++];
2999 		cookie = sock_gen_cookie(item->sk);
3000 		sock_gen_put(item->sk);
3001 		item->cookie = cookie;
3002 	}
3003 }
3004 
3005 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3006 				      unsigned int new_batch_sz, gfp_t flags)
3007 {
3008 	union bpf_tcp_iter_batch_item *new_batch;
3009 
3010 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3011 			     flags | __GFP_NOWARN);
3012 	if (!new_batch)
3013 		return -ENOMEM;
3014 
3015 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3016 	kvfree(iter->batch);
3017 	iter->batch = new_batch;
3018 	iter->max_sk = new_batch_sz;
3019 
3020 	return 0;
3021 }
3022 
3023 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3024 					       union bpf_tcp_iter_batch_item *cookies,
3025 					       int n_cookies)
3026 {
3027 	struct hlist_nulls_node *node;
3028 	struct sock *sk;
3029 	int i;
3030 
3031 	for (i = 0; i < n_cookies; i++) {
3032 		sk = first_sk;
3033 		sk_nulls_for_each_from(sk, node)
3034 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3035 				return sk;
3036 	}
3037 
3038 	return NULL;
3039 }
3040 
/* Resume the walk of the listening hash after a stop().  The previous
 * batch's unconsumed cookies live in iter->batch[cur_sk..end_sk).
 * listening_get_first() returns with the bucket lock held (hence the
 * explicit unlock below when skipping a bucket), and so does this
 * function when it returns a socket.
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* The whole previous batch was shown: continue with the next bucket. */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	/* Still in the bucket we stopped in, with unseen cookies left:
	 * re-locate the first remembered socket that is still hashed.
	 */
	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* All remembered sockets are gone; move on. */
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}
3070 
/* Resume the walk of the established hash after a stop().  Mirrors
 * bpf_iter_tcp_resume_listening() but uses established_get_first() and
 * the BH-disabling ehash bucket locks.
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* The whole previous batch was shown: continue with the next bucket. */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	/* Still in the bucket we stopped in, with unseen cookies left:
	 * re-locate the first remembered socket that is still hashed.
	 */
	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* All remembered sockets are gone; move on. */
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}
3100 
3101 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3102 {
3103 	struct bpf_tcp_iter_state *iter = seq->private;
3104 	struct tcp_iter_state *st = &iter->state;
3105 	struct sock *sk = NULL;
3106 
3107 	switch (st->state) {
3108 	case TCP_SEQ_STATE_LISTENING:
3109 		sk = bpf_iter_tcp_resume_listening(seq);
3110 		if (sk)
3111 			break;
3112 		st->bucket = 0;
3113 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3114 		fallthrough;
3115 	case TCP_SEQ_STATE_ESTABLISHED:
3116 		sk = bpf_iter_tcp_resume_established(seq);
3117 		break;
3118 	}
3119 
3120 	return sk;
3121 }
3122 
3123 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3124 						 struct sock **start_sk)
3125 {
3126 	struct bpf_tcp_iter_state *iter = seq->private;
3127 	struct hlist_nulls_node *node;
3128 	unsigned int expected = 1;
3129 	struct sock *sk;
3130 
3131 	sock_hold(*start_sk);
3132 	iter->batch[iter->end_sk++].sk = *start_sk;
3133 
3134 	sk = sk_nulls_next(*start_sk);
3135 	*start_sk = NULL;
3136 	sk_nulls_for_each_from(sk, node) {
3137 		if (seq_sk_match(seq, sk)) {
3138 			if (iter->end_sk < iter->max_sk) {
3139 				sock_hold(sk);
3140 				iter->batch[iter->end_sk++].sk = sk;
3141 			} else if (!*start_sk) {
3142 				/* Remember where we left off. */
3143 				*start_sk = sk;
3144 			}
3145 			expected++;
3146 		}
3147 	}
3148 
3149 	return expected;
3150 }
3151 
3152 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3153 						   struct sock **start_sk)
3154 {
3155 	struct bpf_tcp_iter_state *iter = seq->private;
3156 	struct hlist_nulls_node *node;
3157 	unsigned int expected = 1;
3158 	struct sock *sk;
3159 
3160 	sock_hold(*start_sk);
3161 	iter->batch[iter->end_sk++].sk = *start_sk;
3162 
3163 	sk = sk_nulls_next(*start_sk);
3164 	*start_sk = NULL;
3165 	sk_nulls_for_each_from(sk, node) {
3166 		if (seq_sk_match(seq, sk)) {
3167 			if (iter->end_sk < iter->max_sk) {
3168 				sock_hold(sk);
3169 				iter->batch[iter->end_sk++].sk = sk;
3170 			} else if (!*start_sk) {
3171 				/* Remember where we left off. */
3172 				*start_sk = sk;
3173 			}
3174 			expected++;
3175 		}
3176 	}
3177 
3178 	return expected;
3179 }
3180 
3181 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3182 					struct sock **start_sk)
3183 {
3184 	struct bpf_tcp_iter_state *iter = seq->private;
3185 	struct tcp_iter_state *st = &iter->state;
3186 
3187 	if (st->state == TCP_SEQ_STATE_LISTENING)
3188 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3189 	else
3190 		return bpf_iter_tcp_established_batch(seq, start_sk);
3191 }
3192 
3193 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3194 {
3195 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3196 	struct bpf_tcp_iter_state *iter = seq->private;
3197 	struct tcp_iter_state *st = &iter->state;
3198 
3199 	if (st->state == TCP_SEQ_STATE_LISTENING)
3200 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3201 	else
3202 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3203 }
3204 
/* Grab references to every matching socket of the next non-empty
 * bucket.  If the batch array is too small it is grown at most twice:
 * first with GFP_USER after dropping the bucket lock, then — if the
 * bucket grew in the meantime — with GFP_NOWAIT while still holding
 * the lock so the bucket cannot change again.  Returns the first
 * socket of the batch, NULL when the walk is done, or an ERR_PTR.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3252 
3253 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3254 {
3255 	/* bpf iter does not support lseek, so it always
3256 	 * continue from where it was stop()-ped.
3257 	 */
3258 	if (*pos)
3259 		return bpf_iter_tcp_batch(seq);
3260 
3261 	return SEQ_START_TOKEN;
3262 }
3263 
/* Advance past the just-shown socket, dropping its ref, and refill the
 * batch from the next bucket once the current batch is exhausted.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
3295 
/* Run the bpf program on one batched socket.  Full sockets are shown
 * under lock_sock(); request/timewait sockets have no socket lock.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (sk_fullsock(sk))
		lock_sock(sk);

	/* The socket may have been unhashed since it was batched. */
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	if (sk->sk_state == TCP_TIME_WAIT) {
		uid = 0;
	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request sockets report their listener's uid. */
		const struct request_sock *req = v;

		uid = from_kuid_munged(seq_user_ns(seq),
				       sk_uid(req->rsk_listener));
	} else {
		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
	}

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = tcp_prog_seq_show(prog, &meta, v, uid);

unlock:
	if (sk_fullsock(sk))
		release_sock(sk);
	return ret;

}
3336 
/* stop() callback: when the walk completed (!v), give the program one
 * final call with a NULL socket, then release any batched sockets that
 * were not shown yet.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}
3353 
/* seq_file operations backing the bpf "tcp" iterator. */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
3360 #endif
/* Address family this seq_file filters on: AF_UNSPEC for the bpf
 * iterator (the bpf program does its own filtering), otherwise the
 * family stored in the procfs entry's afinfo.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
3375 
/* seq_file operations backing /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
3382 
/* /proc/net/tcp lists IPv4 sockets only. */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3386 
3387 static int __net_init tcp4_proc_init_net(struct net *net)
3388 {
3389 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3390 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3391 		return -ENOMEM;
3392 	return 0;
3393 }
3394 
/* Remove /proc/net/tcp when a netns goes away. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3399 
/* Pernet hooks creating/removing /proc/net/tcp per namespace. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3404 
/* Register the /proc/net/tcp pernet operations at boot. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3409 
/* Counterpart of tcp4_proc_init(): unregister the pernet operations. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3414 #endif /* CONFIG_PROC_FS */
3415 
/* IPv4 TCP protocol operations hooked into the generic socket layer. */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,

	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.freeptr_offset		= offsetof(struct tcp_sock,
					   inet_conn.icsk_inet.sk.sk_freeptr),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3468 
/* Per-netns exit: drop the reference taken on the netns' congestion
 * control module, if one was set.
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
3475 
/* Choose the ehash table for a new netns: a private table sized by the
 * creating netns' tcp_child_ehash_entries sysctl, or the global
 * tcp_hashinfo for init_net, when the sysctl is unset, or on
 * allocation failure.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* The creating process' netns supplies the child hash size. */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
fallback:
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	/* Derived limits scale with the chosen ehash size. */
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3505 
/* Initialise all per-netns TCP sysctl defaults and state.  Child
 * namespaces inherit rmem/wmem and (when the module ref can be taken)
 * the congestion control of init_net; everything else gets the
 * compile-time defaults below.  Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	/* ECN / AccECN defaults */
	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	/* Path MTU discovery defaults */
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	/* Keepalive defaults */
	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 4 MB */
	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child namespaces start from init_net's current buffer sizes. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;

	return 0;
}
3613 
/* Batched per-netns teardown: purge timewait sockets, then free each
 * netns' private ehash and its fastopen context.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and failed setup_net error unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, the thread that dismantles a particular twsk must
	 * do so without other thread progressing to refcount_dec_and_test() of
	 * tcp_death_row.tw_refcount.
	 */
	mutex_lock(&tcp_exit_batch_mutex);

	tcp_twsk_purge(net_exit_list);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		/* The initial ref from tcp_sk_init() must be the last one. */
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}

	mutex_unlock(&tcp_exit_batch_mutex);
}
3638 
/* Per-netns TCP lifecycle operations. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
3644 
3645 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3646 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3647 		     struct sock_common *sk_common, uid_t uid)
3648 
3649 #define INIT_BATCH_SZ 16
3650 
3651 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3652 {
3653 	struct bpf_tcp_iter_state *iter = priv_data;
3654 	int err;
3655 
3656 	err = bpf_iter_init_seq_net(priv_data, aux);
3657 	if (err)
3658 		return err;
3659 
3660 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3661 	if (err) {
3662 		bpf_iter_fini_seq_net(priv_data);
3663 		return err;
3664 	}
3665 
3666 	return 0;
3667 }
3668 
/* Counterpart of bpf_iter_init_tcp(): release the netns seq state and
 * free the batch array.
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3676 
/* Glue handing the seq_ops and private-state hooks to the bpf iterator core. */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3683 
3684 static const struct bpf_func_proto *
3685 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3686 			    const struct bpf_prog *prog)
3687 {
3688 	switch (func_id) {
3689 	case BPF_FUNC_setsockopt:
3690 		return &bpf_sk_setsockopt_proto;
3691 	case BPF_FUNC_getsockopt:
3692 		return &bpf_sk_getsockopt_proto;
3693 	default:
3694 		return NULL;
3695 	}
3696 }
3697 
/* Registration record for the bpf "tcp" iterator target; the
 * sk_common BTF id is filled in at registration time.
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3708 
/* Register the "tcp" bpf iterator target.  The sock_common BTF id is
 * only known at runtime, so it cannot be part of the static initializer.
 */
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
3715 
3716 #endif
3717 
/* Boot-time IPv4 TCP setup: create the per-CPU control sockets (used
 * for sending resets/ACKs outside a socket context), register the
 * per-netns ops, and register the bpf iterator when configured.
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		sk->sk_clockid = CLOCK_MONOTONIC;

		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
3747