xref: /linux/net/ipv4/tcp_ipv4.c (revision a7ddedc84c59a645ef970b992f7cda5bffc70cc0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 #include <linux/sock_diag.h>
62 
63 #include <net/aligned_data.h>
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/inet_ecn.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/secure_seq.h>
75 #include <net/busy_poll.h>
76 #include <net/rstreason.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 #include <linux/btf_ids.h>
85 #include <linux/skbuff_ref.h>
86 
87 #include <crypto/hash.h>
88 #include <linux/scatterlist.h>
89 
90 #include <trace/events/tcp.h>
91 
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96 
97 struct inet_hashinfo tcp_hashinfo;
98 
99 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
100 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
101 };
102 
103 static DEFINE_MUTEX(tcp_exit_batch_mutex);
104 
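/* Pick the initial sequence number for a passively opened connection,
 * keyed on the 4-tuple taken from the incoming SYN.
 */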
105 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
106 {
107 	return secure_tcp_seq(ip_hdr(skb)->daddr,
108 			      ip_hdr(skb)->saddr,
109 			      tcp_hdr(skb)->dest,
110 			      tcp_hdr(skb)->source);
111 }
112 
113 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
114 {
115 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
116 }
117 
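/* Decide whether a TIME-WAIT socket may be reused for a new outgoing
 * connection with the same 4-tuple.  Returns 1 (after taking a reference
 * on the timewait socket) if the caller may reuse the port, 0 otherwise.
 */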
118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 	struct tcp_sock *tp = tcp_sk(sk);
124 	int ts_recent_stamp;
125 	u32 reuse_thresh;
126 
127 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 		reuse = 0;
129 
130 	if (reuse == 2) {
131 		/* Still does not detect *everything* that goes through
132 		 * lo, since we require a loopback src or dst address
133 		 * or direct binding to 'lo' interface.
134 		 */
135 		bool loopback = false;
136 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 			loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 		if (tw->tw_family == AF_INET6) {
140 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 				loopback = true;
145 		} else
146 #endif
147 		{
148 			if (ipv4_is_loopback(tw->tw_daddr) ||
149 			    ipv4_is_loopback(tw->tw_rcv_saddr))
150 				loopback = true;
151 		}
152 		if (!loopback)
153 			reuse = 0;
154 	}
155 
156 	/* With PAWS, it is safe from the viewpoint
157 	   of data integrity. Even without PAWS it is safe provided sequence
158 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
159 
160 	   Actually, the idea is close to VJ's, only the timestamp cache is
161 	   held not per host but per port pair, and the TW bucket is used as
162 	   the state holder.
163 
164 	   If the TW bucket has already been destroyed we fall back to VJ's
165 	   scheme and use the initial timestamp retrieved from the peer table.
166 	 */
167 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 	if (ts_recent_stamp &&
171 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 		 * and releasing the bucket lock.
174 		 */
175 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 			return 0;
177 
178 		/* In case of repair and re-using TIME-WAIT sockets we still
179 		 * want to be sure that it is safe as above but honor the
180 		 * sequence numbers and time stamps set as part of the repair
181 		 * process.
182 		 *
183 		 * Without this check re-using a TIME-WAIT socket with TCP
184 		 * repair would accumulate a -1 on the repair assigned
185 		 * sequence number. The first time it is reused the sequence
186 		 * is -1, the second time -2, etc. This fixes that issue
187 		 * without appearing to create any others.
188 		 */
189 		if (likely(!tp->repair)) {
190 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191 
192 			if (!seq)
193 				seq = 1;
194 			WRITE_ONCE(tp->write_seq, seq);
195 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
196 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 		}
198 
199 		return 1;
200 	}
201 
202 	return 0;
203 }
204 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
205 
206 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
207 			      int addr_len)
208 {
209 	/* This check is replicated from tcp_v4_connect() and intended to
210 	 * prevent the BPF program called below from accessing bytes that are
211 	 * out of the bounds specified by the user in addr_len.
212 	 */
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	sock_owned_by_me(sk);
217 
218 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
219 }
220 
221 /* This will initiate an outgoing connection. */
222 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
223 {
224 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
225 	struct inet_timewait_death_row *tcp_death_row;
226 	struct inet_sock *inet = inet_sk(sk);
227 	struct tcp_sock *tp = tcp_sk(sk);
228 	struct ip_options_rcu *inet_opt;
229 	struct net *net = sock_net(sk);
230 	__be16 orig_sport, orig_dport;
231 	__be32 daddr, nexthop;
232 	struct flowi4 *fl4;
233 	struct rtable *rt;
234 	int err;
235 
236 	if (addr_len < sizeof(struct sockaddr_in))
237 		return -EINVAL;
238 
239 	if (usin->sin_family != AF_INET)
240 		return -EAFNOSUPPORT;
241 
242 	nexthop = daddr = usin->sin_addr.s_addr;
243 	inet_opt = rcu_dereference_protected(inet->inet_opt,
244 					     lockdep_sock_is_held(sk));
245 	if (inet_opt && inet_opt->opt.srr) {
246 		if (!daddr)
247 			return -EINVAL;
248 		nexthop = inet_opt->opt.faddr;
249 	}
250 
251 	orig_sport = inet->inet_sport;
252 	orig_dport = usin->sin_port;
253 	fl4 = &inet->cork.fl.u.ip4;
254 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
255 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
256 			      orig_dport, sk);
257 	if (IS_ERR(rt)) {
258 		err = PTR_ERR(rt);
259 		if (err == -ENETUNREACH)
260 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
261 		return err;
262 	}
263 
264 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
265 		ip_rt_put(rt);
266 		return -ENETUNREACH;
267 	}
268 
269 	if (!inet_opt || !inet_opt->opt.srr)
270 		daddr = fl4->daddr;
271 
272 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
273 
274 	if (!inet->inet_saddr) {
275 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
276 		if (err) {
277 			ip_rt_put(rt);
278 			return err;
279 		}
280 	} else {
281 		sk_rcv_saddr_set(sk, inet->inet_saddr);
282 	}
283 
284 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
285 		/* Reset inherited state */
286 		tp->rx_opt.ts_recent	   = 0;
287 		tp->rx_opt.ts_recent_stamp = 0;
288 		if (likely(!tp->repair))
289 			WRITE_ONCE(tp->write_seq, 0);
290 	}
291 
292 	inet->inet_dport = usin->sin_port;
293 	sk_daddr_set(sk, daddr);
294 
295 	inet_csk(sk)->icsk_ext_hdr_len = 0;
296 	if (inet_opt)
297 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
298 
299 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
300 
301 	/* Socket identity is still unknown (sport may be zero).
302 	 * However we set the state to SYN-SENT and, without releasing the
303 	 * socket lock, select a source port, enter ourselves into the hash
304 	 * tables and complete initialization after this.
305 	 */
306 	tcp_set_state(sk, TCP_SYN_SENT);
307 	err = inet_hash_connect(tcp_death_row, sk);
308 	if (err)
309 		goto failure;
310 
311 	sk_set_txhash(sk);
312 
313 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
314 			       inet->inet_sport, inet->inet_dport, sk);
315 	if (IS_ERR(rt)) {
316 		err = PTR_ERR(rt);
317 		rt = NULL;
318 		goto failure;
319 	}
320 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
321 	/* OK, now commit destination to socket.  */
322 	sk->sk_gso_type = SKB_GSO_TCPV4;
323 	sk_setup_caps(sk, &rt->dst);
324 	rt = NULL;
325 
326 	if (likely(!tp->repair)) {
327 		if (!tp->write_seq)
328 			WRITE_ONCE(tp->write_seq,
329 				   secure_tcp_seq(inet->inet_saddr,
330 						  inet->inet_daddr,
331 						  inet->inet_sport,
332 						  usin->sin_port));
333 		WRITE_ONCE(tp->tsoffset,
334 			   secure_tcp_ts_off(net, inet->inet_saddr,
335 					     inet->inet_daddr));
336 	}
337 
338 	atomic_set(&inet->inet_id, get_random_u16());
339 
340 	if (tcp_fastopen_defer_connect(sk, &err))
341 		return err;
342 	if (err)
343 		goto failure;
344 
345 	err = tcp_connect(sk);
346 
347 	if (err)
348 		goto failure;
349 
350 	return 0;
351 
352 failure:
353 	/*
354 	 * This unhashes the socket and releases the local port,
355 	 * if necessary.
356 	 */
357 	tcp_set_state(sk, TCP_CLOSE);
358 	inet_bhash2_reset_saddr(sk);
359 	ip_rt_put(rt);
360 	sk->sk_route_caps = 0;
361 	inet->inet_dport = 0;
362 	return err;
363 }
364 EXPORT_IPV6_MOD(tcp_v4_connect);
365 
366 /*
367  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
368  * It can be called through tcp_release_cb() if the socket was owned by the
369  * user at the time tcp_v4_err() was called to handle the ICMP message.
370  */
371 void tcp_v4_mtu_reduced(struct sock *sk)
372 {
373 	struct inet_sock *inet = inet_sk(sk);
374 	struct dst_entry *dst;
375 	u32 mtu;
376 
377 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
378 		return;
379 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
380 	dst = inet_csk_update_pmtu(sk, mtu);
381 	if (!dst)
382 		return;
383 
384 	/* Something is about to go wrong... Remember the soft error
385 	 * in case this connection is not able to recover.
386 	 */
387 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
388 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
389 
390 	mtu = dst_mtu(dst);
391 
392 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
393 	    ip_sk_accept_pmtu(sk) &&
394 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
395 		tcp_sync_mss(sk, mtu);
396 
397 		/* Resend the TCP packet because it's
398 		 * clear that the old packet has been
399 		 * dropped. This is the new "fast" path mtu
400 		 * discovery.
401 		 */
402 		tcp_simple_retransmit(sk);
403 	} /* else let the usual retransmit timer handle it */
404 }
405 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
406 
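/* Propagate an ICMP redirect to the dst cached on the socket, if any. */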
407 static void do_redirect(struct sk_buff *skb, struct sock *sk)
408 {
409 	struct dst_entry *dst = __sk_dst_check(sk, 0);
410 
411 	if (dst)
412 		dst->ops->redirect(dst, sk, skb);
413 }
414 
415 
416 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
417 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
418 {
419 	struct request_sock *req = inet_reqsk(sk);
420 	struct net *net = sock_net(sk);
421 
422 	/* ICMPs are not backlogged, hence we cannot get
423 	 * an established socket here.
424 	 */
425 	if (seq != tcp_rsk(req)->snt_isn) {
426 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
427 	} else if (abort) {
428 		/*
429 		 * Still in SYN_RECV, just remove it silently.
430 		 * There is no good way to pass the error to the newly
431 		 * created socket, and POSIX does not want network
432 		 * errors returned from accept().
433 		 */
434 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
435 		tcp_listendrop(req->rsk_listener);
436 	}
437 	reqsk_put(req);
438 }
439 EXPORT_IPV6_MOD(tcp_req_err);
440 
441 /* TCP-LD (RFC 6069) logic */
442 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
443 {
444 	struct inet_connection_sock *icsk = inet_csk(sk);
445 	struct tcp_sock *tp = tcp_sk(sk);
446 	struct sk_buff *skb;
447 	s32 remaining;
448 	u32 delta_us;
449 
450 	if (sock_owned_by_user(sk))
451 		return;
452 
453 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
454 	    !icsk->icsk_backoff)
455 		return;
456 
457 	skb = tcp_rtx_queue_head(sk);
458 	if (WARN_ON_ONCE(!skb))
459 		return;
460 
461 	icsk->icsk_backoff--;
462 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
463 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
464 
465 	tcp_mstamp_refresh(tp);
466 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
467 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
468 
469 	if (remaining > 0) {
470 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
471 	} else {
472 		/* RTO revert clocked out retransmission.
473 		 * Will retransmit now.
474 		 */
475 		tcp_retransmit_timer(sk);
476 	}
477 }
478 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
479 
480 /*
481  * This routine is called by the ICMP module when it gets some
482  * sort of error condition.  If err < 0 then the socket should
483  * be closed and the error returned to the user.  If err > 0
484  * it's just the icmp type << 8 | icmp code.  After adjustment
485  * header points to the first 8 bytes of the tcp header.  We need
486  * to find the appropriate port.
487  *
488  * The locking strategy used here is very "optimistic". When
489  * someone else accesses the socket the ICMP is just dropped
490  * and for some paths there is no check at all.
491  * A more general error queue to queue errors for later handling
492  * is probably better.
493  *
494  */
495 
496 int tcp_v4_err(struct sk_buff *skb, u32 info)
497 {
498 	const struct iphdr *iph = (const struct iphdr *)skb->data;
499 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
500 	struct net *net = dev_net_rcu(skb->dev);
501 	const int type = icmp_hdr(skb)->type;
502 	const int code = icmp_hdr(skb)->code;
503 	struct request_sock *fastopen;
504 	struct tcp_sock *tp;
505 	u32 seq, snd_una;
506 	struct sock *sk;
507 	int err;
508 
509 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
510 				       ntohs(th->source), inet_iif(skb), 0);
511 	if (!sk) {
512 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
513 		return -ENOENT;
514 	}
515 	if (sk->sk_state == TCP_TIME_WAIT) {
516 		/* To increase the counter of ignored icmps for TCP-AO */
517 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
518 		inet_twsk_put(inet_twsk(sk));
519 		return 0;
520 	}
521 	seq = ntohl(th->seq);
522 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
523 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
524 				     type == ICMP_TIME_EXCEEDED ||
525 				     (type == ICMP_DEST_UNREACH &&
526 				      (code == ICMP_NET_UNREACH ||
527 				       code == ICMP_HOST_UNREACH)));
528 		return 0;
529 	}
530 
531 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
532 		sock_put(sk);
533 		return 0;
534 	}
535 
536 	bh_lock_sock(sk);
537 	/* If too many ICMPs get dropped on busy
538 	 * servers this needs to be solved differently.
539 	 * We do take care of the PMTU discovery (RFC1191) special case:
540 	 * we can receive locally generated ICMP messages while the socket is held.
541 	 */
542 	if (sock_owned_by_user(sk)) {
543 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
544 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
545 	}
546 	if (sk->sk_state == TCP_CLOSE)
547 		goto out;
548 
549 	if (static_branch_unlikely(&ip4_min_ttl)) {
550 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
551 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
552 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
553 			goto out;
554 		}
555 	}
556 
557 	tp = tcp_sk(sk);
558 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
559 	fastopen = rcu_dereference(tp->fastopen_rsk);
560 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
561 	if (sk->sk_state != TCP_LISTEN &&
562 	    !between(seq, snd_una, tp->snd_nxt)) {
563 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
564 		goto out;
565 	}
566 
567 	switch (type) {
568 	case ICMP_REDIRECT:
569 		if (!sock_owned_by_user(sk))
570 			do_redirect(skb, sk);
571 		goto out;
572 	case ICMP_SOURCE_QUENCH:
573 		/* Just silently ignore these. */
574 		goto out;
575 	case ICMP_PARAMETERPROB:
576 		err = EPROTO;
577 		break;
578 	case ICMP_DEST_UNREACH:
579 		if (code > NR_ICMP_UNREACH)
580 			goto out;
581 
582 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
583 			/* We are not interested in TCP_LISTEN and open_requests
584 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
585 			 * they should go through unfragmented).
586 			 */
587 			if (sk->sk_state == TCP_LISTEN)
588 				goto out;
589 
590 			WRITE_ONCE(tp->mtu_info, info);
591 			if (!sock_owned_by_user(sk)) {
592 				tcp_v4_mtu_reduced(sk);
593 			} else {
594 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
595 					sock_hold(sk);
596 			}
597 			goto out;
598 		}
599 
600 		err = icmp_err_convert[code].errno;
601 		/* check if this ICMP message allows revert of backoff.
602 		 * (see RFC 6069)
603 		 */
604 		if (!fastopen &&
605 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
606 			tcp_ld_RTO_revert(sk, seq);
607 		break;
608 	case ICMP_TIME_EXCEEDED:
609 		err = EHOSTUNREACH;
610 		break;
611 	default:
612 		goto out;
613 	}
614 
615 	switch (sk->sk_state) {
616 	case TCP_SYN_SENT:
617 	case TCP_SYN_RECV:
618 		/* Only in fast or simultaneous open. If a fast open socket is
619 		 * already accepted it is treated as a connected one below.
620 		 */
621 		if (fastopen && !fastopen->sk)
622 			break;
623 
624 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
625 
626 		if (!sock_owned_by_user(sk))
627 			tcp_done_with_error(sk, err);
628 		else
629 			WRITE_ONCE(sk->sk_err_soft, err);
630 		goto out;
631 	}
632 
633 	/* If we've already connected we will keep trying
634 	 * until we time out, or the user gives up.
635 	 *
636 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
637 	 * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
638 	 * by PMTU discovery).
639 	 *
640 	 * Note that in the modern internet, where routing is unreliable and
641 	 * broken firewalls sit in every dark corner sending random errors as
642 	 * ordered by their masters, even these two messages finally lose
643 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
644 	 *
645 	 * Now we are in compliance with RFCs.
646 	 *							--ANK (980905)
647 	 */
648 
649 	if (!sock_owned_by_user(sk) &&
650 	    inet_test_bit(RECVERR, sk)) {
651 		WRITE_ONCE(sk->sk_err, err);
652 		sk_error_report(sk);
653 	} else	{ /* Only an error on timeout */
654 		WRITE_ONCE(sk->sk_err_soft, err);
655 	}
656 
657 out:
658 	bh_unlock_sock(sk);
659 	sock_put(sk);
660 	return 0;
661 }
662 
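/* Seed the TCP checksum with the pseudo-header sum and record the offsets
 * so that the device or checksum helpers can complete it later.
 */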
663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
664 {
665 	struct tcphdr *th = tcp_hdr(skb);
666 
667 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
668 	skb->csum_start = skb_transport_header(skb) - skb->head;
669 	skb->csum_offset = offsetof(struct tcphdr, check);
670 }
671 
672 /* This routine computes an IPv4 TCP checksum. */
673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
674 {
675 	const struct inet_sock *inet = inet_sk(sk);
676 
677 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
678 }
679 EXPORT_IPV6_MOD(tcp_v4_send_check);
680 
681 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
682 
683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
684 				 const struct tcp_ao_hdr *aoh,
685 				 struct ip_reply_arg *arg, struct tcphdr *reply,
686 				 __be32 reply_options[REPLY_OPTIONS_LEN])
687 {
688 #ifdef CONFIG_TCP_AO
689 	int sdif = tcp_v4_sdif(skb);
690 	int dif = inet_iif(skb);
691 	int l3index = sdif ? dif : 0;
692 	bool allocated_traffic_key;
693 	struct tcp_ao_key *key;
694 	char *traffic_key;
695 	bool drop = true;
696 	u32 ao_sne = 0;
697 	u8 keyid;
698 
699 	rcu_read_lock();
700 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
701 				 &key, &traffic_key, &allocated_traffic_key,
702 				 &keyid, &ao_sne))
703 		goto out;
704 
705 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
706 				 (aoh->rnext_keyid << 8) | keyid);
707 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
708 	reply->doff = arg->iov[0].iov_len / 4;
709 
710 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
711 			    key, traffic_key,
712 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
714 			    reply, ao_sne))
715 		goto out;
716 	drop = false;
717 out:
718 	rcu_read_unlock();
719 	if (allocated_traffic_key)
720 		kfree(traffic_key);
721 	return drop;
722 #else
723 	return true;
724 #endif
725 }
726 
727 /*
728  *	This routine will send an RST to the other tcp.
729  *
730  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
731  *		      for the reset.
732  *	Answer: if a packet caused an RST, it is not for a socket
733  *		existing in our system; if it is matched to a socket,
734  *		it is just a duplicate segment or a bug in the other
735  *		side's TCP. So we build the reply based only on the
736  *		parameters that arrived with the segment.
737  *	Exception: precedence violation. We do not implement it in any case.
738  */
739 
740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
741 			      enum sk_rst_reason reason)
742 {
743 	const struct tcphdr *th = tcp_hdr(skb);
744 	struct {
745 		struct tcphdr th;
746 		__be32 opt[REPLY_OPTIONS_LEN];
747 	} rep;
748 	const __u8 *md5_hash_location = NULL;
749 	const struct tcp_ao_hdr *aoh;
750 	struct ip_reply_arg arg;
751 #ifdef CONFIG_TCP_MD5SIG
752 	struct tcp_md5sig_key *key = NULL;
753 	unsigned char newhash[16];
754 	struct sock *sk1 = NULL;
755 	int genhash;
756 #endif
757 	u64 transmit_time = 0;
758 	struct sock *ctl_sk;
759 	struct net *net;
760 	u32 txhash = 0;
761 
762 	/* Never send a reset in response to a reset. */
763 	if (th->rst)
764 		return;
765 
766 	/* If sk is not NULL, it means we did a successful lookup and the
767 	 * incoming route had to be correct. prequeue might have dropped our dst.
768 	 */
769 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
770 		return;
771 
772 	/* Swap the send and the receive. */
773 	memset(&rep, 0, sizeof(rep));
774 	rep.th.dest   = th->source;
775 	rep.th.source = th->dest;
776 	rep.th.doff   = sizeof(struct tcphdr) / 4;
777 	rep.th.rst    = 1;
778 
779 	if (th->ack) {
780 		rep.th.seq = th->ack_seq;
781 	} else {
782 		rep.th.ack = 1;
783 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
784 				       skb->len - (th->doff << 2));
785 	}
786 
787 	memset(&arg, 0, sizeof(arg));
788 	arg.iov[0].iov_base = (unsigned char *)&rep;
789 	arg.iov[0].iov_len  = sizeof(rep.th);
790 
791 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
792 
793 	/* Invalid TCP option size or twice included auth */
794 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
795 		return;
796 
797 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
798 		return;
799 
800 #ifdef CONFIG_TCP_MD5SIG
801 	rcu_read_lock();
802 	if (sk && sk_fullsock(sk)) {
803 		const union tcp_md5_addr *addr;
804 		int l3index;
805 
806 		/* sdif set, means packet ingressed via a device
807 		 * in an L3 domain and inet_iif is set to it.
808 		 */
809 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
810 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
811 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
812 	} else if (md5_hash_location) {
813 		const union tcp_md5_addr *addr;
814 		int sdif = tcp_v4_sdif(skb);
815 		int dif = inet_iif(skb);
816 		int l3index;
817 
818 		/*
819 		 * The active side is lost. Try to find the listening socket
820 		 * through the source port, and then find the md5 key through
821 		 * the listening socket. We do not lose security here:
822 		 * the incoming packet is checked against the md5 hash of the
823 		 * found key; no RST is generated if the md5 hash doesn't match.
824 		 */
825 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
826 					     th->source, ip_hdr(skb)->daddr,
827 					     ntohs(th->source), dif, sdif);
828 		/* don't send rst if it can't find key */
829 		if (!sk1)
830 			goto out;
831 
832 		/* sdif set, means packet ingressed via a device
833 		 * in an L3 domain and dif is set to it.
834 		 */
835 		l3index = sdif ? dif : 0;
836 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
837 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
838 		if (!key)
839 			goto out;
840 
841 
842 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
843 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
844 			goto out;
845 
846 	}
847 
848 	if (key) {
849 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
850 				   (TCPOPT_NOP << 16) |
851 				   (TCPOPT_MD5SIG << 8) |
852 				   TCPOLEN_MD5SIG);
853 		/* Update length and the length the header thinks exists */
854 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
855 		rep.th.doff = arg.iov[0].iov_len / 4;
856 
857 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
858 				     key, ip_hdr(skb)->saddr,
859 				     ip_hdr(skb)->daddr, &rep.th);
860 	}
861 #endif
862 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
863 	if (rep.opt[0] == 0) {
864 		__be32 mrst = mptcp_reset_option(skb);
865 
866 		if (mrst) {
867 			rep.opt[0] = mrst;
868 			arg.iov[0].iov_len += sizeof(mrst);
869 			rep.th.doff = arg.iov[0].iov_len / 4;
870 		}
871 	}
872 
873 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
874 				      ip_hdr(skb)->saddr, /* XXX */
875 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
876 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
877 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
878 
879 	/* When the socket is gone, all binding information is lost and
880 	 * routing might fail in this case. No choice here: if we choose to force
881 	 * the input interface, we will misroute in case of an asymmetric route.
882 	 */
883 	if (sk)
884 		arg.bound_dev_if = sk->sk_bound_dev_if;
885 
886 	trace_tcp_send_reset(sk, skb, reason);
887 
888 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
889 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
890 
891 	/* ECN bits of TW reset are cleared */
892 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
893 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
894 	local_bh_disable();
895 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
896 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
897 
898 	sock_net_set(ctl_sk, net);
899 	if (sk) {
900 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
901 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
902 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
903 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
904 		transmit_time = tcp_transmit_time(sk);
905 		xfrm_sk_clone_policy(ctl_sk, sk);
906 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
907 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
908 	} else {
909 		ctl_sk->sk_mark = 0;
910 		ctl_sk->sk_priority = 0;
911 	}
912 	ip_send_unicast_reply(ctl_sk, sk,
913 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
914 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
915 			      &arg, arg.iov[0].iov_len,
916 			      transmit_time, txhash);
917 
918 	xfrm_sk_free_policy(ctl_sk);
919 	sock_net_set(ctl_sk, &init_net);
920 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
921 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
922 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
923 	local_bh_enable();
924 
925 #ifdef CONFIG_TCP_MD5SIG
926 out:
927 	rcu_read_unlock();
928 #endif
929 }
930 
931 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
932    outside socket context, is certainly ugly. What can I do?
933  */
934 
935 static void tcp_v4_send_ack(const struct sock *sk,
936 			    struct sk_buff *skb, u32 seq, u32 ack,
937 			    u32 win, u32 tsval, u32 tsecr, int oif,
938 			    struct tcp_key *key,
939 			    int reply_flags, u8 tos, u32 txhash)
940 {
941 	const struct tcphdr *th = tcp_hdr(skb);
942 	struct {
943 		struct tcphdr th;
944 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
945 	} rep;
946 	struct net *net = sock_net(sk);
947 	struct ip_reply_arg arg;
948 	struct sock *ctl_sk;
949 	u64 transmit_time;
950 
951 	memset(&rep.th, 0, sizeof(struct tcphdr));
952 	memset(&arg, 0, sizeof(arg));
953 
954 	arg.iov[0].iov_base = (unsigned char *)&rep;
955 	arg.iov[0].iov_len  = sizeof(rep.th);
956 	if (tsecr) {
957 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
958 				   (TCPOPT_TIMESTAMP << 8) |
959 				   TCPOLEN_TIMESTAMP);
960 		rep.opt[1] = htonl(tsval);
961 		rep.opt[2] = htonl(tsecr);
962 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
963 	}
964 
965 	/* Swap the send and the receive. */
966 	rep.th.dest    = th->source;
967 	rep.th.source  = th->dest;
968 	rep.th.doff    = arg.iov[0].iov_len / 4;
969 	rep.th.seq     = htonl(seq);
970 	rep.th.ack_seq = htonl(ack);
971 	rep.th.ack     = 1;
972 	rep.th.window  = htons(win);
973 
974 #ifdef CONFIG_TCP_MD5SIG
975 	if (tcp_key_is_md5(key)) {
976 		int offset = (tsecr) ? 3 : 0;
977 
978 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
979 					  (TCPOPT_NOP << 16) |
980 					  (TCPOPT_MD5SIG << 8) |
981 					  TCPOLEN_MD5SIG);
982 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
983 		rep.th.doff = arg.iov[0].iov_len/4;
984 
985 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
986 				    key->md5_key, ip_hdr(skb)->saddr,
987 				    ip_hdr(skb)->daddr, &rep.th);
988 	}
989 #endif
990 #ifdef CONFIG_TCP_AO
991 	if (tcp_key_is_ao(key)) {
992 		int offset = (tsecr) ? 3 : 0;
993 
994 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
995 					  (tcp_ao_len(key->ao_key) << 16) |
996 					  (key->ao_key->sndid << 8) |
997 					  key->rcv_next);
998 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
999 		rep.th.doff = arg.iov[0].iov_len / 4;
1000 
1001 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1002 				key->ao_key, key->traffic_key,
1003 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1004 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1005 				&rep.th, key->sne);
1006 	}
1007 #endif
1008 	arg.flags = reply_flags;
1009 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1010 				      ip_hdr(skb)->saddr, /* XXX */
1011 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1012 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1013 	if (oif)
1014 		arg.bound_dev_if = oif;
1015 	arg.tos = tos;
1016 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1017 	local_bh_disable();
1018 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1019 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1020 	sock_net_set(ctl_sk, net);
1021 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1022 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1023 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1024 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1025 	transmit_time = tcp_transmit_time(sk);
1026 	ip_send_unicast_reply(ctl_sk, sk,
1027 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1028 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1029 			      &arg, arg.iov[0].iov_len,
1030 			      transmit_time, txhash);
1031 
1032 	sock_net_set(ctl_sk, &init_net);
1033 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1034 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1035 	local_bh_enable();
1036 }
1037 
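/* Send an ACK on behalf of a TIME-WAIT socket, signed with TCP-AO or MD5
 * when the timewait socket carries keys.
 */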
1038 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1039 				enum tcp_tw_status tw_status)
1040 {
1041 	struct inet_timewait_sock *tw = inet_twsk(sk);
1042 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1043 	struct tcp_key key = {};
1044 	u8 tos = tw->tw_tos;
1045 
1046 	/* Clean only the ECN bits of TW ACKs sent for out-of-window data or
1047 	 * paws_reject, while not cleaning the ECN bits of other TW ACKs, to
1048 	 * avoid those ACKs being placed in a different service queue (Classic rather than L4S)
1049 	 */
1050 	if (tw_status == TCP_TW_ACK_OOW)
1051 		tos &= ~INET_ECN_MASK;
1052 
1053 #ifdef CONFIG_TCP_AO
1054 	struct tcp_ao_info *ao_info;
1055 
1056 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1057 		/* FIXME: the segment to-be-acked is not verified yet */
1058 		ao_info = rcu_dereference(tcptw->ao_info);
1059 		if (ao_info) {
1060 			const struct tcp_ao_hdr *aoh;
1061 
1062 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1063 				inet_twsk_put(tw);
1064 				return;
1065 			}
1066 
1067 			if (aoh)
1068 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1069 								    aoh->rnext_keyid, -1);
1070 		}
1071 	}
1072 	if (key.ao_key) {
1073 		struct tcp_ao_key *rnext_key;
1074 
1075 		key.traffic_key = snd_other_key(key.ao_key);
1076 		key.sne = READ_ONCE(ao_info->snd_sne);
1077 		rnext_key = READ_ONCE(ao_info->rnext_key);
1078 		key.rcv_next = rnext_key->rcvid;
1079 		key.type = TCP_KEY_AO;
1080 #else
1081 	if (0) {
1082 #endif
1083 	} else if (static_branch_tcp_md5()) {
1084 		key.md5_key = tcp_twsk_md5_key(tcptw);
1085 		if (key.md5_key)
1086 			key.type = TCP_KEY_MD5;
1087 	}
1088 
1089 	tcp_v4_send_ack(sk, skb,
1090 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1091 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1092 			tcp_tw_tsval(tcptw),
1093 			READ_ONCE(tcptw->tw_ts_recent),
1094 			tw->tw_bound_dev_if, &key,
1095 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1096 			tos,
1097 			tw->tw_txhash);
1098 
1099 	inet_twsk_put(tw);
1100 }
1101 
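/* ACK a segment received for a request socket (SYN_RECV or Fast Open),
 * using the listener's TCP-AO or MD5 keys when configured.
 */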
1102 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1103 				  struct request_sock *req)
1104 {
1105 	struct tcp_key key = {};
1106 
1107 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1108 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1109 	 */
1110 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1111 					     tcp_sk(sk)->snd_nxt;
1112 
1113 #ifdef CONFIG_TCP_AO
1114 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1115 	    tcp_rsk_used_ao(req)) {
1116 		const union tcp_md5_addr *addr;
1117 		const struct tcp_ao_hdr *aoh;
1118 		int l3index;
1119 
1120 		/* Invalid TCP option size or twice included auth */
1121 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1122 			return;
1123 		if (!aoh)
1124 			return;
1125 
1126 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1127 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1128 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1129 					      aoh->rnext_keyid, -1);
1130 		if (unlikely(!key.ao_key)) {
1131 			/* Send ACK with any matching MKT for the peer */
1132 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1133 			/* Matching key disappeared (user removed the key?)
1134 			 * let the handshake time out.
1135 			 */
1136 			if (!key.ao_key) {
1137 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1138 						     addr,
1139 						     ntohs(tcp_hdr(skb)->source),
1140 						     &ip_hdr(skb)->daddr,
1141 						     ntohs(tcp_hdr(skb)->dest));
1142 				return;
1143 			}
1144 		}
1145 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1146 		if (!key.traffic_key)
1147 			return;
1148 
1149 		key.type = TCP_KEY_AO;
1150 		key.rcv_next = aoh->keyid;
1151 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1152 #else
1153 	if (0) {
1154 #endif
1155 	} else if (static_branch_tcp_md5()) {
1156 		const union tcp_md5_addr *addr;
1157 		int l3index;
1158 
1159 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1160 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1161 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1162 		if (key.md5_key)
1163 			key.type = TCP_KEY_MD5;
1164 	}
1165 
1166 	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
1167 	tcp_v4_send_ack(sk, skb, seq,
1168 			tcp_rsk(req)->rcv_nxt,
1169 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1170 			tcp_rsk_tsval(tcp_rsk(req)),
1171 			req->ts_recent,
1172 			0, &key,
1173 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1174 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1175 			READ_ONCE(tcp_rsk(req)->txhash));
1176 	if (tcp_key_is_ao(&key))
1177 		kfree(key.traffic_key);
1178 }
1179 
1180 /*
1181  *	Send a SYN-ACK after having received a SYN.
1182  *	This still operates on a request_sock only, not on a big
1183  *	socket.
1184  */
1185 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1186 			      struct flowi *fl,
1187 			      struct request_sock *req,
1188 			      struct tcp_fastopen_cookie *foc,
1189 			      enum tcp_synack_type synack_type,
1190 			      struct sk_buff *syn_skb)
1191 {
1192 	const struct inet_request_sock *ireq = inet_rsk(req);
1193 	struct flowi4 fl4;
1194 	int err = -1;
1195 	struct sk_buff *skb;
1196 	u8 tos;
1197 
1198 	/* First, grab a route. */
1199 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1200 		return -1;
1201 
1202 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1203 
1204 	if (skb) {
1205 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1206 
1207 		tos = READ_ONCE(inet_sk(sk)->tos);
1208 
1209 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1210 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1211 			      (tos & INET_ECN_MASK);
1212 
1213 		if (!INET_ECN_is_capable(tos) &&
1214 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1215 			tos |= INET_ECN_ECT_0;
1216 
1217 		rcu_read_lock();
1218 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1219 					    ireq->ir_rmt_addr,
1220 					    rcu_dereference(ireq->ireq_opt),
1221 					    tos);
1222 		rcu_read_unlock();
1223 		err = net_xmit_eval(err);
1224 	}
1225 
1226 	return err;
1227 }
1228 
1229 /*
1230  *	IPv4 request_sock destructor.
1231  */
1232 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1233 {
1234 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1235 }
1236 
1237 #ifdef CONFIG_TCP_MD5SIG
1238 /*
1239  * RFC2385 MD5 checksumming requires a mapping of
1240  * IP address->MD5 Key.
1241  * We need to maintain these in the sk structure.
1242  */
1243 
1244 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1245 EXPORT_IPV6_MOD(tcp_md5_needed);
1246 
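/* A key bound to an L3 device always beats an unbound key; otherwise the
 * key with the longer prefix wins.  Used by __tcp_md5_do_lookup() to pick
 * the most specific match.
 */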
1247 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1248 {
1249 	if (!old)
1250 		return true;
1251 
1252 	/* l3index always overrides non-l3index */
1253 	if (old->l3index && new->l3index == 0)
1254 		return false;
1255 	if (old->l3index == 0 && new->l3index)
1256 		return true;
1257 
1258 	return old->prefixlen < new->prefixlen;
1259 }
1260 
1261 /* Find the Key structure for an address.  */
1262 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1263 					   const union tcp_md5_addr *addr,
1264 					   int family, bool any_l3index)
1265 {
1266 	const struct tcp_sock *tp = tcp_sk(sk);
1267 	struct tcp_md5sig_key *key;
1268 	const struct tcp_md5sig_info *md5sig;
1269 	__be32 mask;
1270 	struct tcp_md5sig_key *best_match = NULL;
1271 	bool match;
1272 
1273 	/* caller either holds rcu_read_lock() or socket lock */
1274 	md5sig = rcu_dereference_check(tp->md5sig_info,
1275 				       lockdep_sock_is_held(sk));
1276 	if (!md5sig)
1277 		return NULL;
1278 
1279 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1280 				 lockdep_sock_is_held(sk)) {
1281 		if (key->family != family)
1282 			continue;
1283 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1284 		    key->l3index != l3index)
1285 			continue;
1286 		if (family == AF_INET) {
1287 			mask = inet_make_mask(key->prefixlen);
1288 			match = (key->addr.a4.s_addr & mask) ==
1289 				(addr->a4.s_addr & mask);
1290 #if IS_ENABLED(CONFIG_IPV6)
1291 		} else if (family == AF_INET6) {
1292 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1293 						  key->prefixlen);
1294 #endif
1295 		} else {
1296 			match = false;
1297 		}
1298 
1299 		if (match && better_md5_match(best_match, key))
1300 			best_match = key;
1301 	}
1302 	return best_match;
1303 }
1304 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1305 
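/* Like __tcp_md5_do_lookup(), but require an exact match on prefix length,
 * L3 index and the IFINDEX flag instead of picking the best match.
 */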
1306 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1307 						      const union tcp_md5_addr *addr,
1308 						      int family, u8 prefixlen,
1309 						      int l3index, u8 flags)
1310 {
1311 	const struct tcp_sock *tp = tcp_sk(sk);
1312 	struct tcp_md5sig_key *key;
1313 	unsigned int size = sizeof(struct in_addr);
1314 	const struct tcp_md5sig_info *md5sig;
1315 
1316 	/* caller either holds rcu_read_lock() or socket lock */
1317 	md5sig = rcu_dereference_check(tp->md5sig_info,
1318 				       lockdep_sock_is_held(sk));
1319 	if (!md5sig)
1320 		return NULL;
1321 #if IS_ENABLED(CONFIG_IPV6)
1322 	if (family == AF_INET6)
1323 		size = sizeof(struct in6_addr);
1324 #endif
1325 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1326 				 lockdep_sock_is_held(sk)) {
1327 		if (key->family != family)
1328 			continue;
1329 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1330 			continue;
1331 		if (key->l3index != l3index)
1332 			continue;
1333 		if (!memcmp(&key->addr, addr, size) &&
1334 		    key->prefixlen == prefixlen)
1335 			return key;
1336 	}
1337 	return NULL;
1338 }
1339 
1340 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1341 					 const struct sock *addr_sk)
1342 {
1343 	const union tcp_md5_addr *addr;
1344 	int l3index;
1345 
1346 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1347 						 addr_sk->sk_bound_dev_if);
1348 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1349 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1350 }
1351 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1352 
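/* Allocate the per-socket MD5 key list head.  GSO is disabled because
 * per-segment MD5 signatures cannot be produced by segmentation offload.
 */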
1353 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1354 {
1355 	struct tcp_sock *tp = tcp_sk(sk);
1356 	struct tcp_md5sig_info *md5sig;
1357 
1358 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1359 	if (!md5sig)
1360 		return -ENOMEM;
1361 
1362 	sk_gso_disable(sk);
1363 	INIT_HLIST_HEAD(&md5sig->head);
1364 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1365 	return 0;
1366 }
1367 
1368 /* This can be called on a newly created socket, from other files */
1369 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1370 			    int family, u8 prefixlen, int l3index, u8 flags,
1371 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1372 {
1373 	/* Add Key to the list */
1374 	struct tcp_md5sig_key *key;
1375 	struct tcp_sock *tp = tcp_sk(sk);
1376 	struct tcp_md5sig_info *md5sig;
1377 
1378 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1379 	if (key) {
1380 		/* Pre-existing entry - just update that one.
1381 		 * Note that the key might be used concurrently.
1382 		 * data_race() is telling kcsan that we do not care about
1383 		 * key mismatches, since changing MD5 key on live flows
1384 		 * can lead to packet drops.
1385 		 */
1386 		data_race(memcpy(key->key, newkey, newkeylen));
1387 
1388 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1389 		 * Also note that a reader could catch the new key->keylen value
1390 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1391 		 * at sock_kmalloc() time below these lines.
1392 		 */
1393 		WRITE_ONCE(key->keylen, newkeylen);
1394 
1395 		return 0;
1396 	}
1397 
1398 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1399 					   lockdep_sock_is_held(sk));
1400 
1401 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1402 	if (!key)
1403 		return -ENOMEM;
1404 
1405 	memcpy(key->key, newkey, newkeylen);
1406 	key->keylen = newkeylen;
1407 	key->family = family;
1408 	key->prefixlen = prefixlen;
1409 	key->l3index = l3index;
1410 	key->flags = flags;
1411 	memcpy(&key->addr, addr,
1412 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1413 								 sizeof(struct in_addr));
1414 	hlist_add_head_rcu(&key->node, &md5sig->head);
1415 	return 0;
1416 }
1417 
1418 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1419 		   int family, u8 prefixlen, int l3index, u8 flags,
1420 		   const u8 *newkey, u8 newkeylen)
1421 {
1422 	struct tcp_sock *tp = tcp_sk(sk);
1423 
1424 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1425 		if (tcp_md5_alloc_sigpool())
1426 			return -ENOMEM;
1427 
1428 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1429 			tcp_md5_release_sigpool();
1430 			return -ENOMEM;
1431 		}
1432 
1433 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1434 			struct tcp_md5sig_info *md5sig;
1435 
1436 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1437 			rcu_assign_pointer(tp->md5sig_info, NULL);
1438 			kfree_rcu(md5sig, rcu);
1439 			tcp_md5_release_sigpool();
1440 			return -EUSERS;
1441 		}
1442 	}
1443 
1444 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1445 				newkey, newkeylen, GFP_KERNEL);
1446 }
1447 EXPORT_IPV6_MOD(tcp_md5_do_add);
1448 
1449 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1450 		     int family, u8 prefixlen, int l3index,
1451 		     struct tcp_md5sig_key *key)
1452 {
1453 	struct tcp_sock *tp = tcp_sk(sk);
1454 
1455 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1456 		tcp_md5_add_sigpool();
1457 
1458 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1459 			tcp_md5_release_sigpool();
1460 			return -ENOMEM;
1461 		}
1462 
1463 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1464 			struct tcp_md5sig_info *md5sig;
1465 
1466 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1467 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1468 			rcu_assign_pointer(tp->md5sig_info, NULL);
1469 			kfree_rcu(md5sig, rcu);
1470 			tcp_md5_release_sigpool();
1471 			return -EUSERS;
1472 		}
1473 	}
1474 
1475 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1476 				key->flags, key->key, key->keylen,
1477 				sk_gfp_mask(sk, GFP_ATOMIC));
1478 }
1479 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1480 
1481 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1482 		   u8 prefixlen, int l3index, u8 flags)
1483 {
1484 	struct tcp_md5sig_key *key;
1485 
1486 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1487 	if (!key)
1488 		return -ENOENT;
1489 	hlist_del_rcu(&key->node);
1490 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1491 	kfree_rcu(key, rcu);
1492 	return 0;
1493 }
1494 EXPORT_IPV6_MOD(tcp_md5_do_del);
1495 
1496 void tcp_clear_md5_list(struct sock *sk)
1497 {
1498 	struct tcp_sock *tp = tcp_sk(sk);
1499 	struct tcp_md5sig_key *key;
1500 	struct hlist_node *n;
1501 	struct tcp_md5sig_info *md5sig;
1502 
1503 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1504 
1505 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1506 		hlist_del_rcu(&key->node);
1507 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1508 		kfree_rcu(key, rcu);
1509 	}
1510 }
1511 
1512 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1513 				 sockptr_t optval, int optlen)
1514 {
1515 	struct tcp_md5sig cmd;
1516 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1517 	const union tcp_md5_addr *addr;
1518 	u8 prefixlen = 32;
1519 	int l3index = 0;
1520 	bool l3flag;
1521 	u8 flags;
1522 
1523 	if (optlen < sizeof(cmd))
1524 		return -EINVAL;
1525 
1526 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1527 		return -EFAULT;
1528 
1529 	if (sin->sin_family != AF_INET)
1530 		return -EINVAL;
1531 
1532 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1533 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1534 
1535 	if (optname == TCP_MD5SIG_EXT &&
1536 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1537 		prefixlen = cmd.tcpm_prefixlen;
1538 		if (prefixlen > 32)
1539 			return -EINVAL;
1540 	}
1541 
1542 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1543 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1544 		struct net_device *dev;
1545 
1546 		rcu_read_lock();
1547 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1548 		if (dev && netif_is_l3_master(dev))
1549 			l3index = dev->ifindex;
1550 
1551 		rcu_read_unlock();
1552 
1553 		/* ok to reference set/not set outside of rcu;
1554 		 * right now device MUST be an L3 master
1555 		 */
1556 		if (!dev || !l3index)
1557 			return -EINVAL;
1558 	}
1559 
1560 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1561 
1562 	if (!cmd.tcpm_keylen)
1563 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1564 
1565 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1566 		return -EINVAL;
1567 
1568 	/* Don't allow keys for peers that have a matching TCP-AO key.
1569 	 * See the comment in tcp_ao_add_cmd()
1570 	 */
1571 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1572 		return -EKEYREJECTED;
1573 
1574 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1575 			      cmd.tcpm_key, cmd.tcpm_keylen);
1576 }
1577 
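/* Feed the IPv4 pseudo-header and the TCP header (with its checksum field
 * zeroed) into the MD5 hash.
 */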
1578 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1579 				   __be32 daddr, __be32 saddr,
1580 				   const struct tcphdr *th, int nbytes)
1581 {
1582 	struct tcp4_pseudohdr *bp;
1583 	struct scatterlist sg;
1584 	struct tcphdr *_th;
1585 
1586 	bp = hp->scratch;
1587 	bp->saddr = saddr;
1588 	bp->daddr = daddr;
1589 	bp->pad = 0;
1590 	bp->protocol = IPPROTO_TCP;
1591 	bp->len = cpu_to_be16(nbytes);
1592 
1593 	_th = (struct tcphdr *)(bp + 1);
1594 	memcpy(_th, th, sizeof(*th));
1595 	_th->check = 0;
1596 
1597 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1598 	ahash_request_set_crypt(hp->req, &sg, NULL,
1599 				sizeof(*bp) + sizeof(*th));
1600 	return crypto_ahash_update(hp->req);
1601 }
1602 
1603 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1604 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1605 {
1606 	struct tcp_sigpool hp;
1607 
1608 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1609 		goto clear_hash_nostart;
1610 
1611 	if (crypto_ahash_init(hp.req))
1612 		goto clear_hash;
1613 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1614 		goto clear_hash;
1615 	if (tcp_md5_hash_key(&hp, key))
1616 		goto clear_hash;
1617 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1618 	if (crypto_ahash_final(hp.req))
1619 		goto clear_hash;
1620 
1621 	tcp_sigpool_end(&hp);
1622 	return 0;
1623 
1624 clear_hash:
1625 	tcp_sigpool_end(&hp);
1626 clear_hash_nostart:
1627 	memset(md5_hash, 0, 16);
1628 	return 1;
1629 }
1630 
1631 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1632 			const struct sock *sk,
1633 			const struct sk_buff *skb)
1634 {
1635 	const struct tcphdr *th = tcp_hdr(skb);
1636 	struct tcp_sigpool hp;
1637 	__be32 saddr, daddr;
1638 
1639 	if (sk) { /* valid for establish/request sockets */
1640 		saddr = sk->sk_rcv_saddr;
1641 		daddr = sk->sk_daddr;
1642 	} else {
1643 		const struct iphdr *iph = ip_hdr(skb);
1644 		saddr = iph->saddr;
1645 		daddr = iph->daddr;
1646 	}
1647 
1648 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1649 		goto clear_hash_nostart;
1650 
1651 	if (crypto_ahash_init(hp.req))
1652 		goto clear_hash;
1653 
1654 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1655 		goto clear_hash;
1656 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1657 		goto clear_hash;
1658 	if (tcp_md5_hash_key(&hp, key))
1659 		goto clear_hash;
1660 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1661 	if (crypto_ahash_final(hp.req))
1662 		goto clear_hash;
1663 
1664 	tcp_sigpool_end(&hp);
1665 	return 0;
1666 
1667 clear_hash:
1668 	tcp_sigpool_end(&hp);
1669 clear_hash_nostart:
1670 	memset(md5_hash, 0, 16);
1671 	return 1;
1672 }
1673 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1674 
1675 #endif
1676 
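/* Fill in the IPv4-specific fields of a new request socket from the
 * incoming SYN: addresses and any IP options to echo on the SYN-ACK.
 */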
1677 static void tcp_v4_init_req(struct request_sock *req,
1678 			    const struct sock *sk_listener,
1679 			    struct sk_buff *skb)
1680 {
1681 	struct inet_request_sock *ireq = inet_rsk(req);
1682 	struct net *net = sock_net(sk_listener);
1683 
1684 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1685 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1686 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1687 }
1688 
1689 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1690 					  struct sk_buff *skb,
1691 					  struct flowi *fl,
1692 					  struct request_sock *req,
1693 					  u32 tw_isn)
1694 {
1695 	tcp_v4_init_req(req, sk, skb);
1696 
1697 	if (security_inet_conn_request(sk, skb, req))
1698 		return NULL;
1699 
1700 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1701 }
1702 
1703 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1704 	.family		=	PF_INET,
1705 	.obj_size	=	sizeof(struct tcp_request_sock),
1706 	.send_ack	=	tcp_v4_reqsk_send_ack,
1707 	.destructor	=	tcp_v4_reqsk_destructor,
1708 	.send_reset	=	tcp_v4_send_reset,
1709 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1710 };
1711 
1712 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1713 	.mss_clamp	=	TCP_MSS_DEFAULT,
1714 #ifdef CONFIG_TCP_MD5SIG
1715 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1716 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1717 #endif
1718 #ifdef CONFIG_TCP_AO
1719 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1720 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1721 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1722 #endif
1723 #ifdef CONFIG_SYN_COOKIES
1724 	.cookie_init_seq =	cookie_v4_init_sequence,
1725 #endif
1726 	.route_req	=	tcp_v4_route_req,
1727 	.init_seq	=	tcp_v4_init_seq,
1728 	.init_ts_off	=	tcp_v4_init_ts_off,
1729 	.send_synack	=	tcp_v4_send_synack,
1730 };
1731 
1732 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1733 {
1734 	/* Never answer SYNs sent to broadcast or multicast */
1735 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1736 		goto drop;
1737 
1738 	return tcp_conn_request(&tcp_request_sock_ops,
1739 				&tcp_request_sock_ipv4_ops, sk, skb);
1740 
1741 drop:
1742 	tcp_listendrop(sk);
1743 	return 0;
1744 }
1745 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1746 
1747 
1748 /*
1749  * The three way handshake has completed - we got a valid synack -
1750  * now create the new socket.
1751  */
1752 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1753 				  struct request_sock *req,
1754 				  struct dst_entry *dst,
1755 				  struct request_sock *req_unhash,
1756 				  bool *own_req)
1757 {
1758 	struct inet_request_sock *ireq;
1759 	bool found_dup_sk = false;
1760 	struct inet_sock *newinet;
1761 	struct tcp_sock *newtp;
1762 	struct sock *newsk;
1763 #ifdef CONFIG_TCP_MD5SIG
1764 	const union tcp_md5_addr *addr;
1765 	struct tcp_md5sig_key *key;
1766 	int l3index;
1767 #endif
1768 	struct ip_options_rcu *inet_opt;
1769 
1770 	if (sk_acceptq_is_full(sk))
1771 		goto exit_overflow;
1772 
1773 	newsk = tcp_create_openreq_child(sk, req, skb);
1774 	if (!newsk)
1775 		goto exit_nonewsk;
1776 
1777 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1778 	inet_sk_rx_dst_set(newsk, skb);
1779 
1780 	newtp		      = tcp_sk(newsk);
1781 	newinet		      = inet_sk(newsk);
1782 	ireq		      = inet_rsk(req);
1783 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1784 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1785 	newinet->mc_index     = inet_iif(skb);
1786 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1787 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1788 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1789 	if (inet_opt)
1790 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1791 	atomic_set(&newinet->inet_id, get_random_u16());
1792 
1793 	/* Set ToS of the new socket based upon the value of the incoming SYN.
1794 	 * ECT bits are set later in tcp_init_transfer().
1795 	 */
1796 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1797 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1798 
1799 	if (!dst) {
1800 		dst = inet_csk_route_child_sock(sk, newsk, req);
1801 		if (!dst)
1802 			goto put_and_exit;
1803 	} else {
1804 		/* syncookie case : see end of cookie_v4_check() */
1805 	}
1806 	sk_setup_caps(newsk, dst);
1807 
1808 	tcp_ca_openreq_child(newsk, dst);
1809 
1810 	tcp_sync_mss(newsk, dst_mtu(dst));
1811 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1812 
1813 	tcp_initialize_rcv_mss(newsk);
1814 
1815 #ifdef CONFIG_TCP_MD5SIG
1816 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1817 	/* Copy over the MD5 key from the original socket */
1818 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1819 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1820 	if (key && !tcp_rsk_used_ao(req)) {
1821 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1822 			goto put_and_exit;
1823 		sk_gso_disable(newsk);
1824 	}
1825 #endif
1826 #ifdef CONFIG_TCP_AO
1827 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1828 		goto put_and_exit; /* OOM, release back memory */
1829 #endif
1830 
1831 	if (__inet_inherit_port(sk, newsk) < 0)
1832 		goto put_and_exit;
1833 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1834 				       &found_dup_sk);
1835 	if (likely(*own_req)) {
1836 		tcp_move_syn(newtp, req);
1837 		ireq->ireq_opt = NULL;
1838 	} else {
1839 		newinet->inet_opt = NULL;
1840 
1841 		if (!req_unhash && found_dup_sk) {
1842 			/* This code path should only be executed in the
1843 			 * syncookie case.
1844 			 */
1845 			bh_unlock_sock(newsk);
1846 			sock_put(newsk);
1847 			newsk = NULL;
1848 		}
1849 	}
1850 	return newsk;
1851 
1852 exit_overflow:
1853 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1854 exit_nonewsk:
1855 	dst_release(dst);
1856 exit:
1857 	tcp_listendrop(sk);
1858 	return NULL;
1859 put_and_exit:
1860 	newinet->inet_opt = NULL;
1861 	inet_csk_prepare_forced_close(newsk);
1862 	tcp_done(newsk);
1863 	goto exit;
1864 }
1865 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1866 
1867 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1868 {
1869 #ifdef CONFIG_SYN_COOKIES
1870 	const struct tcphdr *th = tcp_hdr(skb);
1871 
1872 	if (!th->syn)
1873 		sk = cookie_v4_check(sk, skb);
1874 #endif
1875 	return sk;
1876 }
1877 
1878 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1879 			 struct tcphdr *th, u32 *cookie)
1880 {
1881 	u16 mss = 0;
1882 #ifdef CONFIG_SYN_COOKIES
1883 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1884 				    &tcp_request_sock_ipv4_ops, sk, th);
1885 	if (mss) {
1886 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1887 		tcp_synq_overflow(sk);
1888 	}
1889 #endif
1890 	return mss;
1891 }
1892 
1893 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1894 							   u32));
1895 /* The socket must have its spinlock held when we get
1896  * here, unless it is a TCP_LISTEN socket.
1897  *
1898  * We have a potential double-lock case here, so even when
1899  * doing backlog processing we use the BH locking scheme.
1900  * This is because we cannot sleep with the original spinlock
1901  * held.
1902  */
1903 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1904 {
1905 	enum skb_drop_reason reason;
1906 	struct sock *rsk;
1907 
1908 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1909 		struct dst_entry *dst;
1910 
1911 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1912 						lockdep_sock_is_held(sk));
1913 
1914 		sock_rps_save_rxhash(sk, skb);
1915 		sk_mark_napi_id(sk, skb);
1916 		if (dst) {
1917 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1918 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1919 					     dst, 0)) {
1920 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1921 				dst_release(dst);
1922 			}
1923 		}
1924 		tcp_rcv_established(sk, skb);
1925 		return 0;
1926 	}
1927 
1928 	if (tcp_checksum_complete(skb))
1929 		goto csum_err;
1930 
1931 	if (sk->sk_state == TCP_LISTEN) {
1932 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1933 
1934 		if (!nsk)
1935 			return 0;
1936 		if (nsk != sk) {
1937 			reason = tcp_child_process(sk, nsk, skb);
1938 			if (reason) {
1939 				rsk = nsk;
1940 				goto reset;
1941 			}
1942 			return 0;
1943 		}
1944 	} else
1945 		sock_rps_save_rxhash(sk, skb);
1946 
1947 	reason = tcp_rcv_state_process(sk, skb);
1948 	if (reason) {
1949 		rsk = sk;
1950 		goto reset;
1951 	}
1952 	return 0;
1953 
1954 reset:
1955 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1956 discard:
1957 	sk_skb_reason_drop(sk, skb, reason);
1958 	/* Be careful here. If this function gets more complicated and
1959 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1960 	 * might be destroyed here. This current version compiles correctly,
1961 	 * but you have been warned.
1962 	 */
1963 	return 0;
1964 
1965 csum_err:
1966 	reason = SKB_DROP_REASON_TCP_CSUM;
1967 	trace_tcp_bad_csum(skb);
1968 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1969 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1970 	goto discard;
1971 }
1972 EXPORT_SYMBOL(tcp_v4_do_rcv);
1973 
1974 int tcp_v4_early_demux(struct sk_buff *skb)
1975 {
1976 	struct net *net = dev_net_rcu(skb->dev);
1977 	const struct iphdr *iph;
1978 	const struct tcphdr *th;
1979 	struct sock *sk;
1980 
1981 	if (skb->pkt_type != PACKET_HOST)
1982 		return 0;
1983 
1984 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1985 		return 0;
1986 
1987 	iph = ip_hdr(skb);
1988 	th = tcp_hdr(skb);
1989 
1990 	if (th->doff < sizeof(struct tcphdr) / 4)
1991 		return 0;
1992 
1993 	sk = __inet_lookup_established(net, iph->saddr, th->source,
1994 				       iph->daddr, ntohs(th->dest),
1995 				       skb->skb_iif, inet_sdif(skb));
1996 	if (sk) {
1997 		skb->sk = sk;
1998 		skb->destructor = sock_edemux;
1999 		if (sk_fullsock(sk)) {
2000 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2001 
2002 			if (dst)
2003 				dst = dst_check(dst, 0);
2004 			if (dst &&
2005 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2006 				skb_dst_set_noref(skb, dst);
2007 		}
2008 	}
2009 	return 0;
2010 }
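/* Note on the early demux fast path above: it is a best-effort hint run
 * before routing.  On a hit it attaches the established socket to the skb
 * (released later via the sock_edemux destructor) and, if the cached rx dst
 * is still valid for the incoming interface, sets it on the skb so the
 * normal receive path can skip a second socket lookup and a route lookup.
 * A miss simply falls through to the regular path.
 */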
2011 
2012 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2013 		     enum skb_drop_reason *reason)
2014 {
2015 	u32 tail_gso_size, tail_gso_segs;
2016 	struct skb_shared_info *shinfo;
2017 	const struct tcphdr *th;
2018 	struct tcphdr *thtail;
2019 	struct sk_buff *tail;
2020 	unsigned int hdrlen;
2021 	bool fragstolen;
2022 	u32 gso_segs;
2023 	u32 gso_size;
2024 	u64 limit;
2025 	int delta;
2026 	int err;
2027 
2028 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2029 	 * we can fix skb->truesize to its real value to avoid future drops.
2030 	 * This is valid because skb is not yet charged to the socket.
2031 	 * It has been observed that pure SACK packets were sometimes dropped
2032 	 * (if cooked by drivers without the copybreak feature).
2033 	 */
2034 	skb_condense(skb);
2035 
2036 	tcp_cleanup_skb(skb);
2037 
2038 	if (unlikely(tcp_checksum_complete(skb))) {
2039 		bh_unlock_sock(sk);
2040 		trace_tcp_bad_csum(skb);
2041 		*reason = SKB_DROP_REASON_TCP_CSUM;
2042 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2043 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2044 		return true;
2045 	}
2046 
2047 	/* Attempt coalescing to last skb in backlog, even if we are
2048 	 * above the limits.
2049 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2050 	 */
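	/* In short, the checks below allow coalescing only when the new
	 * segment starts exactly at tail's end_seq, both carry the same IP
	 * DS field, neither has SYN/RST/URG set, both have ACK set, their
	 * ECE/CWR/AE bits match, tcp_skb_can_collapse_rx() agrees, and the
	 * TCP header length and options are identical.
	 */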
2051 	th = (const struct tcphdr *)skb->data;
2052 	hdrlen = th->doff * 4;
2053 
2054 	tail = sk->sk_backlog.tail;
2055 	if (!tail)
2056 		goto no_coalesce;
2057 	thtail = (struct tcphdr *)tail->data;
2058 
2059 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2060 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2061 	    ((TCP_SKB_CB(tail)->tcp_flags |
2062 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2063 	    !((TCP_SKB_CB(tail)->tcp_flags &
2064 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2065 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2066 	      TCP_SKB_CB(skb)->tcp_flags) &
2067 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2068 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2069 	    thtail->doff != th->doff ||
2070 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2071 		goto no_coalesce;
2072 
2073 	__skb_pull(skb, hdrlen);
2074 
2075 	shinfo = skb_shinfo(skb);
2076 	gso_size = shinfo->gso_size ?: skb->len;
2077 	gso_segs = shinfo->gso_segs ?: 1;
2078 
2079 	shinfo = skb_shinfo(tail);
2080 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2081 	tail_gso_segs = shinfo->gso_segs ?: 1;
2082 
2083 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2084 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2085 
2086 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2087 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2088 			thtail->window = th->window;
2089 		}
2090 
2091 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2092 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2093 		 * is not entered if we append a packet with a FIN.
2094 		 * SYN, RST, URG are not present.
2095 		 * ACK is set on both packets.
2096 		 * PSH : the TCP stack does not really care,
2097 		 *       at least for 'GRO' packets.
2098 		 */
2099 		thtail->fin |= th->fin;
2100 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2101 
2102 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2103 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2104 			tail->tstamp = skb->tstamp;
2105 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2106 		}
2107 
2108 		/* Not as strict as GRO. We only need to carry the max mss value */
2109 		shinfo->gso_size = max(gso_size, tail_gso_size);
2110 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
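		/* gso_segs is a 16-bit field in struct skb_shared_info,
		 * hence the clamp to 0xFFFF above.
		 */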
2111 
2112 		sk->sk_backlog.len += delta;
2113 		__NET_INC_STATS(sock_net(sk),
2114 				LINUX_MIB_TCPBACKLOGCOALESCE);
2115 		kfree_skb_partial(skb, fragstolen);
2116 		return false;
2117 	}
2118 	__skb_push(skb, hdrlen);
2119 
2120 no_coalesce:
2121 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2122 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2123 	 * sk_rcvbuf in normal conditions.
2124 	 */
2125 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2126 
2127 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2128 
2129 	/* Only the socket owner can try to collapse/prune rx queues
2130 	 * to reduce memory overhead, so add a little headroom here.
2131 	 * Only a few socket backlogs are likely to be non-empty at any time.
2132 	 */
2133 	limit += 64 * 1024;
2134 
2135 	limit = min_t(u64, limit, UINT_MAX);
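	/* Worked example with purely illustrative values: for sk_rcvbuf of
	 * 4 MB and sk_sndbuf of 2 MB, the limit is 2 * 4 MB + 2 MB / 2 +
	 * 64 KB = 9 MB + 64 KB, well below the UINT_MAX cap.
	 */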
2136 
2137 	err = sk_add_backlog(sk, skb, limit);
2138 	if (unlikely(err)) {
2139 		bh_unlock_sock(sk);
2140 		if (err == -ENOMEM) {
2141 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2142 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2143 		} else {
2144 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2145 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2146 		}
2147 		return true;
2148 	}
2149 	return false;
2150 }
2151 EXPORT_IPV6_MOD(tcp_add_backlog);
2152 
2153 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2154 {
2155 	struct tcphdr *th = (struct tcphdr *)skb->data;
2156 
2157 	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
2158 }
2159 EXPORT_IPV6_MOD(tcp_filter);
2160 
2161 static void tcp_v4_restore_cb(struct sk_buff *skb)
2162 {
2163 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2164 		sizeof(struct inet_skb_parm));
2165 }
2166 
2167 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2168 			   const struct tcphdr *th)
2169 {
2170 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2171 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2172 	 */
2173 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2174 		sizeof(struct inet_skb_parm));
2175 	barrier();
2176 
2177 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2178 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2179 				    skb->len - th->doff * 4);
2180 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2181 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2182 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2183 	TCP_SKB_CB(skb)->sacked	 = 0;
2184 	TCP_SKB_CB(skb)->has_rxtstamp =
2185 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2186 }
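/* Example of the end_seq arithmetic above: a segment carrying 100 bytes of
 * payload with neither SYN nor FIN set advances end_seq by 100, while SYN
 * and FIN each consume one extra sequence number on top of the payload.
 */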
2187 
2188 /*
2189  *	From tcp_input.c
2190  */
2191 
2192 int tcp_v4_rcv(struct sk_buff *skb)
2193 {
2194 	struct net *net = dev_net_rcu(skb->dev);
2195 	enum skb_drop_reason drop_reason;
2196 	enum tcp_tw_status tw_status;
2197 	int sdif = inet_sdif(skb);
2198 	int dif = inet_iif(skb);
2199 	const struct iphdr *iph;
2200 	const struct tcphdr *th;
2201 	struct sock *sk = NULL;
2202 	bool refcounted;
2203 	int ret;
2204 	u32 isn;
2205 
2206 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2207 	if (skb->pkt_type != PACKET_HOST)
2208 		goto discard_it;
2209 
2210 	/* Count it even if it's bad */
2211 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2212 
2213 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2214 		goto discard_it;
2215 
2216 	th = (const struct tcphdr *)skb->data;
2217 
2218 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2219 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2220 		goto bad_packet;
2221 	}
2222 	if (!pskb_may_pull(skb, th->doff * 4))
2223 		goto discard_it;
2224 
2225 	/* An explanation is required here, I think.
2226 	 * Packet length and doff are validated by header prediction,
2227 	 * provided the case of th->doff == 0 is eliminated.
2228 	 * So, we defer the checks. */
2229 
2230 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2231 		goto csum_error;
2232 
2233 	th = (const struct tcphdr *)skb->data;
2234 	iph = ip_hdr(skb);
2235 lookup:
2236 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2237 			       th->dest, sdif, &refcounted);
2238 	if (!sk)
2239 		goto no_tcp_socket;
2240 
2241 	if (sk->sk_state == TCP_TIME_WAIT)
2242 		goto do_time_wait;
2243 
2244 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2245 		struct request_sock *req = inet_reqsk(sk);
2246 		bool req_stolen = false;
2247 		struct sock *nsk;
2248 
2249 		sk = req->rsk_listener;
2250 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2251 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2252 		else
2253 			drop_reason = tcp_inbound_hash(sk, req, skb,
2254 						       &iph->saddr, &iph->daddr,
2255 						       AF_INET, dif, sdif);
2256 		if (unlikely(drop_reason)) {
2257 			sk_drops_skbadd(sk, skb);
2258 			reqsk_put(req);
2259 			goto discard_it;
2260 		}
2261 		if (tcp_checksum_complete(skb)) {
2262 			reqsk_put(req);
2263 			goto csum_error;
2264 		}
2265 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2266 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2267 			if (!nsk) {
2268 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2269 				goto lookup;
2270 			}
2271 			sk = nsk;
2272 			/* reuseport_migrate_sock() already holds one sk_refcnt
2273 			 * reference when it returns.
2274 			 */
2275 		} else {
2276 			/* We own a reference on the listener, increase it again
2277 			 * as we might lose it too soon.
2278 			 */
2279 			sock_hold(sk);
2280 		}
2281 		refcounted = true;
2282 		nsk = NULL;
2283 		if (!tcp_filter(sk, skb, &drop_reason)) {
2284 			th = (const struct tcphdr *)skb->data;
2285 			iph = ip_hdr(skb);
2286 			tcp_v4_fill_cb(skb, iph, th);
2287 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2288 					    &drop_reason);
2289 		}
2290 		if (!nsk) {
2291 			reqsk_put(req);
2292 			if (req_stolen) {
2293 				/* Another cpu got exclusive access to req
2294 				 * and created a full-blown socket.
2295 				 * Try to feed this packet to this socket
2296 				 * instead of discarding it.
2297 				 */
2298 				tcp_v4_restore_cb(skb);
2299 				sock_put(sk);
2300 				goto lookup;
2301 			}
2302 			goto discard_and_relse;
2303 		}
2304 		nf_reset_ct(skb);
2305 		if (nsk == sk) {
2306 			reqsk_put(req);
2307 			tcp_v4_restore_cb(skb);
2308 		} else {
2309 			drop_reason = tcp_child_process(sk, nsk, skb);
2310 			if (drop_reason) {
2311 				enum sk_rst_reason rst_reason;
2312 
2313 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2314 				tcp_v4_send_reset(nsk, skb, rst_reason);
2315 				goto discard_and_relse;
2316 			}
2317 			sock_put(sk);
2318 			return 0;
2319 		}
2320 	}
2321 
2322 process:
2323 	if (static_branch_unlikely(&ip4_min_ttl)) {
2324 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2325 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2326 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2327 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2328 			goto discard_and_relse;
2329 		}
2330 	}
2331 
2332 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2333 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2334 		goto discard_and_relse;
2335 	}
2336 
2337 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2338 				       AF_INET, dif, sdif);
2339 	if (drop_reason)
2340 		goto discard_and_relse;
2341 
2342 	nf_reset_ct(skb);
2343 
2344 	if (tcp_filter(sk, skb, &drop_reason))
2345 		goto discard_and_relse;
2346 
2347 	th = (const struct tcphdr *)skb->data;
2348 	iph = ip_hdr(skb);
2349 	tcp_v4_fill_cb(skb, iph, th);
2350 
2351 	skb->dev = NULL;
2352 
2353 	if (sk->sk_state == TCP_LISTEN) {
2354 		ret = tcp_v4_do_rcv(sk, skb);
2355 		goto put_and_return;
2356 	}
2357 
2358 	sk_incoming_cpu_update(sk);
2359 
2360 	bh_lock_sock_nested(sk);
2361 	tcp_segs_in(tcp_sk(sk), skb);
2362 	ret = 0;
2363 	if (!sock_owned_by_user(sk)) {
2364 		ret = tcp_v4_do_rcv(sk, skb);
2365 	} else {
2366 		if (tcp_add_backlog(sk, skb, &drop_reason))
2367 			goto discard_and_relse;
2368 	}
2369 	bh_unlock_sock(sk);
2370 
2371 put_and_return:
2372 	if (refcounted)
2373 		sock_put(sk);
2374 
2375 	return ret;
2376 
2377 no_tcp_socket:
2378 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2379 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2380 		goto discard_it;
2381 
2382 	tcp_v4_fill_cb(skb, iph, th);
2383 
2384 	if (tcp_checksum_complete(skb)) {
2385 csum_error:
2386 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2387 		trace_tcp_bad_csum(skb);
2388 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2389 bad_packet:
2390 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2391 	} else {
2392 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2393 	}
2394 
2395 discard_it:
2396 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2397 	/* Discard frame. */
2398 	sk_skb_reason_drop(sk, skb, drop_reason);
2399 	return 0;
2400 
2401 discard_and_relse:
2402 	sk_drops_skbadd(sk, skb);
2403 	if (refcounted)
2404 		sock_put(sk);
2405 	goto discard_it;
2406 
2407 do_time_wait:
2408 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2409 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2410 		inet_twsk_put(inet_twsk(sk));
2411 		goto discard_it;
2412 	}
2413 
2414 	tcp_v4_fill_cb(skb, iph, th);
2415 
2416 	if (tcp_checksum_complete(skb)) {
2417 		inet_twsk_put(inet_twsk(sk));
2418 		goto csum_error;
2419 	}
2420 
2421 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2422 					       &drop_reason);
2423 	switch (tw_status) {
2424 	case TCP_TW_SYN: {
2425 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2426 							iph->saddr, th->source,
2427 							iph->daddr, th->dest,
2428 							inet_iif(skb),
2429 							sdif);
2430 		if (sk2) {
2431 			inet_twsk_deschedule_put(inet_twsk(sk));
2432 			sk = sk2;
2433 			tcp_v4_restore_cb(skb);
2434 			refcounted = false;
2435 			__this_cpu_write(tcp_tw_isn, isn);
2436 			goto process;
2437 		}
2438 	}
2439 		/* to ACK */
2440 		fallthrough;
2441 	case TCP_TW_ACK:
2442 	case TCP_TW_ACK_OOW:
2443 		tcp_v4_timewait_ack(sk, skb, tw_status);
2444 		break;
2445 	case TCP_TW_RST:
2446 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2447 		inet_twsk_deschedule_put(inet_twsk(sk));
2448 		goto discard_it;
2449 	case TCP_TW_SUCCESS:;
2450 	}
2451 	goto discard_it;
2452 }
2453 
2454 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2455 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2456 };
2457 
2458 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2459 {
2460 	struct dst_entry *dst = skb_dst(skb);
2461 
2462 	if (dst && dst_hold_safe(dst)) {
2463 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2464 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2465 	}
2466 }
2467 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2468 
2469 const struct inet_connection_sock_af_ops ipv4_specific = {
2470 	.queue_xmit	   = ip_queue_xmit,
2471 	.send_check	   = tcp_v4_send_check,
2472 	.rebuild_header	   = inet_sk_rebuild_header,
2473 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2474 	.conn_request	   = tcp_v4_conn_request,
2475 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2476 	.net_header_len	   = sizeof(struct iphdr),
2477 	.setsockopt	   = ip_setsockopt,
2478 	.getsockopt	   = ip_getsockopt,
2479 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2480 };
2481 EXPORT_IPV6_MOD(ipv4_specific);
2482 
2483 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2484 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2485 #ifdef CONFIG_TCP_MD5SIG
2486 	.md5_lookup		= tcp_v4_md5_lookup,
2487 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2488 	.md5_parse		= tcp_v4_parse_md5_keys,
2489 #endif
2490 #ifdef CONFIG_TCP_AO
2491 	.ao_lookup		= tcp_v4_ao_lookup,
2492 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2493 	.ao_parse		= tcp_v4_parse_ao,
2494 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2495 #endif
2496 };
2497 #endif
2498 
2499 /* NOTE: A lot of things are set to zero explicitly by the call to
2500  *       sk_alloc(), so they need not be done here.
2501  */
2502 static int tcp_v4_init_sock(struct sock *sk)
2503 {
2504 	struct inet_connection_sock *icsk = inet_csk(sk);
2505 
2506 	tcp_init_sock(sk);
2507 
2508 	icsk->icsk_af_ops = &ipv4_specific;
2509 
2510 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2511 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2512 #endif
2513 
2514 	return 0;
2515 }
2516 
2517 #ifdef CONFIG_TCP_MD5SIG
2518 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2519 {
2520 	struct tcp_md5sig_info *md5sig;
2521 
2522 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2523 	kfree(md5sig);
2524 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2525 	tcp_md5_release_sigpool();
2526 }
2527 #endif
2528 
2529 static void tcp_release_user_frags(struct sock *sk)
2530 {
2531 #ifdef CONFIG_PAGE_POOL
2532 	unsigned long index;
2533 	void *netmem;
2534 
2535 	xa_for_each(&sk->sk_user_frags, index, netmem)
2536 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2537 #endif
2538 }
2539 
2540 void tcp_v4_destroy_sock(struct sock *sk)
2541 {
2542 	struct tcp_sock *tp = tcp_sk(sk);
2543 
2544 	tcp_release_user_frags(sk);
2545 
2546 	xa_destroy(&sk->sk_user_frags);
2547 
2548 	trace_tcp_destroy_sock(sk);
2549 
2550 	tcp_clear_xmit_timers(sk);
2551 
2552 	tcp_cleanup_congestion_control(sk);
2553 
2554 	tcp_cleanup_ulp(sk);
2555 
2556 	/* Clean up the write buffer. */
2557 	tcp_write_queue_purge(sk);
2558 
2559 	/* Check if we want to disable active TFO */
2560 	tcp_fastopen_active_disable_ofo_check(sk);
2561 
2562 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2563 	skb_rbtree_purge(&tp->out_of_order_queue);
2564 
2565 #ifdef CONFIG_TCP_MD5SIG
2566 	/* Clean up the MD5 key list, if any */
2567 	if (tp->md5sig_info) {
2568 		struct tcp_md5sig_info *md5sig;
2569 
2570 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2571 		tcp_clear_md5_list(sk);
2572 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2573 		rcu_assign_pointer(tp->md5sig_info, NULL);
2574 	}
2575 #endif
2576 	tcp_ao_destroy_sock(sk, false);
2577 
2578 	/* Clean up a referenced TCP bind bucket. */
2579 	if (inet_csk(sk)->icsk_bind_hash)
2580 		inet_put_port(sk);
2581 
2582 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2583 
2584 	/* If the socket was aborted during the connect operation */
2585 	tcp_free_fastopen_req(tp);
2586 	tcp_fastopen_destroy_cipher(sk);
2587 	tcp_saved_syn_free(tp);
2588 
2589 	sk_sockets_allocated_dec(sk);
2590 }
2591 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2592 
2593 #ifdef CONFIG_PROC_FS
2594 /* Proc filesystem TCP sock list dumping. */
2595 
2596 static unsigned short seq_file_family(const struct seq_file *seq);
2597 
2598 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2599 {
2600 	unsigned short family = seq_file_family(seq);
2601 
2602 	/* AF_UNSPEC is used as a match all */
2603 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2604 		net_eq(sock_net(sk), seq_file_net(seq)));
2605 }
2606 
2607 /* Find a non-empty bucket (starting from st->bucket)
2608  * and return the first sk from it.
2609  */
2610 static void *listening_get_first(struct seq_file *seq)
2611 {
2612 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2613 	struct tcp_iter_state *st = seq->private;
2614 
2615 	st->offset = 0;
2616 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2617 		struct inet_listen_hashbucket *ilb2;
2618 		struct hlist_nulls_node *node;
2619 		struct sock *sk;
2620 
2621 		ilb2 = &hinfo->lhash2[st->bucket];
2622 		if (hlist_nulls_empty(&ilb2->nulls_head))
2623 			continue;
2624 
2625 		spin_lock(&ilb2->lock);
2626 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2627 			if (seq_sk_match(seq, sk))
2628 				return sk;
2629 		}
2630 		spin_unlock(&ilb2->lock);
2631 	}
2632 
2633 	return NULL;
2634 }
2635 
2636 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2637  * If "cur" is the last one in the st->bucket,
2638  * call listening_get_first() to return the first sk of the next
2639  * non-empty bucket.
2640  */
2641 static void *listening_get_next(struct seq_file *seq, void *cur)
2642 {
2643 	struct tcp_iter_state *st = seq->private;
2644 	struct inet_listen_hashbucket *ilb2;
2645 	struct hlist_nulls_node *node;
2646 	struct inet_hashinfo *hinfo;
2647 	struct sock *sk = cur;
2648 
2649 	++st->num;
2650 	++st->offset;
2651 
2652 	sk = sk_nulls_next(sk);
2653 	sk_nulls_for_each_from(sk, node) {
2654 		if (seq_sk_match(seq, sk))
2655 			return sk;
2656 	}
2657 
2658 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2659 	ilb2 = &hinfo->lhash2[st->bucket];
2660 	spin_unlock(&ilb2->lock);
2661 	++st->bucket;
2662 	return listening_get_first(seq);
2663 }
2664 
2665 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2666 {
2667 	struct tcp_iter_state *st = seq->private;
2668 	void *rc;
2669 
2670 	st->bucket = 0;
2671 	st->offset = 0;
2672 	rc = listening_get_first(seq);
2673 
2674 	while (rc && *pos) {
2675 		rc = listening_get_next(seq, rc);
2676 		--*pos;
2677 	}
2678 	return rc;
2679 }
2680 
2681 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2682 				const struct tcp_iter_state *st)
2683 {
2684 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2685 }
2686 
2687 /*
2688  * Get first established socket starting from bucket given in st->bucket.
2689  * If st->bucket is zero, the very first socket in the hash is returned.
2690  */
2691 static void *established_get_first(struct seq_file *seq)
2692 {
2693 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2694 	struct tcp_iter_state *st = seq->private;
2695 
2696 	st->offset = 0;
2697 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2698 		struct sock *sk;
2699 		struct hlist_nulls_node *node;
2700 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2701 
2702 		cond_resched();
2703 
2704 		/* Lockless fast path for the common case of empty buckets */
2705 		if (empty_bucket(hinfo, st))
2706 			continue;
2707 
2708 		spin_lock_bh(lock);
2709 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2710 			if (seq_sk_match(seq, sk))
2711 				return sk;
2712 		}
2713 		spin_unlock_bh(lock);
2714 	}
2715 
2716 	return NULL;
2717 }
2718 
2719 static void *established_get_next(struct seq_file *seq, void *cur)
2720 {
2721 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2722 	struct tcp_iter_state *st = seq->private;
2723 	struct hlist_nulls_node *node;
2724 	struct sock *sk = cur;
2725 
2726 	++st->num;
2727 	++st->offset;
2728 
2729 	sk = sk_nulls_next(sk);
2730 
2731 	sk_nulls_for_each_from(sk, node) {
2732 		if (seq_sk_match(seq, sk))
2733 			return sk;
2734 	}
2735 
2736 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2737 	++st->bucket;
2738 	return established_get_first(seq);
2739 }
2740 
2741 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2742 {
2743 	struct tcp_iter_state *st = seq->private;
2744 	void *rc;
2745 
2746 	st->bucket = 0;
2747 	rc = established_get_first(seq);
2748 
2749 	while (rc && pos) {
2750 		rc = established_get_next(seq, rc);
2751 		--pos;
2752 	}
2753 	return rc;
2754 }
2755 
2756 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2757 {
2758 	void *rc;
2759 	struct tcp_iter_state *st = seq->private;
2760 
2761 	st->state = TCP_SEQ_STATE_LISTENING;
2762 	rc	  = listening_get_idx(seq, &pos);
2763 
2764 	if (!rc) {
2765 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2766 		rc	  = established_get_idx(seq, pos);
2767 	}
2768 
2769 	return rc;
2770 }
2771 
2772 static void *tcp_seek_last_pos(struct seq_file *seq)
2773 {
2774 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2775 	struct tcp_iter_state *st = seq->private;
2776 	int bucket = st->bucket;
2777 	int offset = st->offset;
2778 	int orig_num = st->num;
2779 	void *rc = NULL;
2780 
2781 	switch (st->state) {
2782 	case TCP_SEQ_STATE_LISTENING:
2783 		if (st->bucket > hinfo->lhash2_mask)
2784 			break;
2785 		rc = listening_get_first(seq);
2786 		while (offset-- && rc && bucket == st->bucket)
2787 			rc = listening_get_next(seq, rc);
2788 		if (rc)
2789 			break;
2790 		st->bucket = 0;
2791 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2792 		fallthrough;
2793 	case TCP_SEQ_STATE_ESTABLISHED:
2794 		if (st->bucket > hinfo->ehash_mask)
2795 			break;
2796 		rc = established_get_first(seq);
2797 		while (offset-- && rc && bucket == st->bucket)
2798 			rc = established_get_next(seq, rc);
2799 	}
2800 
2801 	st->num = orig_num;
2802 
2803 	return rc;
2804 }
2805 
2806 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2807 {
2808 	struct tcp_iter_state *st = seq->private;
2809 	void *rc;
2810 
2811 	if (*pos && *pos == st->last_pos) {
2812 		rc = tcp_seek_last_pos(seq);
2813 		if (rc)
2814 			goto out;
2815 	}
2816 
2817 	st->state = TCP_SEQ_STATE_LISTENING;
2818 	st->num = 0;
2819 	st->bucket = 0;
2820 	st->offset = 0;
2821 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2822 
2823 out:
2824 	st->last_pos = *pos;
2825 	return rc;
2826 }
2827 EXPORT_IPV6_MOD(tcp_seq_start);
2828 
2829 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2830 {
2831 	struct tcp_iter_state *st = seq->private;
2832 	void *rc = NULL;
2833 
2834 	if (v == SEQ_START_TOKEN) {
2835 		rc = tcp_get_idx(seq, 0);
2836 		goto out;
2837 	}
2838 
2839 	switch (st->state) {
2840 	case TCP_SEQ_STATE_LISTENING:
2841 		rc = listening_get_next(seq, v);
2842 		if (!rc) {
2843 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2844 			st->bucket = 0;
2845 			st->offset = 0;
2846 			rc	  = established_get_first(seq);
2847 		}
2848 		break;
2849 	case TCP_SEQ_STATE_ESTABLISHED:
2850 		rc = established_get_next(seq, v);
2851 		break;
2852 	}
2853 out:
2854 	++*pos;
2855 	st->last_pos = *pos;
2856 	return rc;
2857 }
2858 EXPORT_IPV6_MOD(tcp_seq_next);
2859 
2860 void tcp_seq_stop(struct seq_file *seq, void *v)
2861 {
2862 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2863 	struct tcp_iter_state *st = seq->private;
2864 
2865 	switch (st->state) {
2866 	case TCP_SEQ_STATE_LISTENING:
2867 		if (v != SEQ_START_TOKEN)
2868 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2869 		break;
2870 	case TCP_SEQ_STATE_ESTABLISHED:
2871 		if (v)
2872 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2873 		break;
2874 	}
2875 }
2876 EXPORT_IPV6_MOD(tcp_seq_stop);
2877 
2878 static void get_openreq4(const struct request_sock *req,
2879 			 struct seq_file *f, int i)
2880 {
2881 	const struct inet_request_sock *ireq = inet_rsk(req);
2882 	long delta = req->rsk_timer.expires - jiffies;
2883 
2884 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2885 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2886 		i,
2887 		ireq->ir_loc_addr,
2888 		ireq->ir_num,
2889 		ireq->ir_rmt_addr,
2890 		ntohs(ireq->ir_rmt_port),
2891 		TCP_SYN_RECV,
2892 		0, 0, /* could print option size, but that is af dependent. */
2893 		1,    /* timers active (only the expire timer) */
2894 		jiffies_delta_to_clock_t(delta),
2895 		req->num_timeout,
2896 		from_kuid_munged(seq_user_ns(f),
2897 				 sk_uid(req->rsk_listener)),
2898 		0,  /* non-standard timer */
2899 		0, /* open_requests have no inode */
2900 		0,
2901 		req);
2902 }
2903 
2904 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2905 {
2906 	int timer_active;
2907 	unsigned long timer_expires;
2908 	const struct tcp_sock *tp = tcp_sk(sk);
2909 	const struct inet_connection_sock *icsk = inet_csk(sk);
2910 	const struct inet_sock *inet = inet_sk(sk);
2911 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2912 	__be32 dest = inet->inet_daddr;
2913 	__be32 src = inet->inet_rcv_saddr;
2914 	__u16 destp = ntohs(inet->inet_dport);
2915 	__u16 srcp = ntohs(inet->inet_sport);
2916 	u8 icsk_pending;
2917 	int rx_queue;
2918 	int state;
2919 
2920 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2921 	if (icsk_pending == ICSK_TIME_RETRANS ||
2922 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2923 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2924 		timer_active	= 1;
2925 		timer_expires	= icsk_timeout(icsk);
2926 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2927 		timer_active	= 4;
2928 		timer_expires	= icsk_timeout(icsk);
2929 	} else if (timer_pending(&sk->sk_timer)) {
2930 		timer_active	= 2;
2931 		timer_expires	= sk->sk_timer.expires;
2932 	} else {
2933 		timer_active	= 0;
2934 		timer_expires = jiffies;
2935 	}
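	/* These timer_active values follow the "tr" column convention of
	 * /proc/net/tcp: 0 no timer pending, 1 retransmit/loss-probe/reorder
	 * timer, 2 the keepalive sk_timer, 4 zero window probe; value 3 is
	 * emitted for TIME_WAIT sockets by get_timewait4_sock().
	 */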
2936 
2937 	state = inet_sk_state_load(sk);
2938 	if (state == TCP_LISTEN)
2939 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2940 	else
2941 		/* Because we don't lock the socket,
2942 		 * we might find a transient negative value.
2943 		 */
2944 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2945 				      READ_ONCE(tp->copied_seq), 0);
2946 
2947 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2948 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2949 		i, src, srcp, dest, destp, state,
2950 		READ_ONCE(tp->write_seq) - tp->snd_una,
2951 		rx_queue,
2952 		timer_active,
2953 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2954 		READ_ONCE(icsk->icsk_retransmits),
2955 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2956 		READ_ONCE(icsk->icsk_probes_out),
2957 		sock_i_ino(sk),
2958 		refcount_read(&sk->sk_refcnt), sk,
2959 		jiffies_to_clock_t(icsk->icsk_rto),
2960 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2961 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2962 		tcp_snd_cwnd(tp),
2963 		state == TCP_LISTEN ?
2964 		    fastopenq->max_qlen :
2965 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2966 }
2967 
2968 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2969 			       struct seq_file *f, int i)
2970 {
2971 	long delta = tw->tw_timer.expires - jiffies;
2972 	__be32 dest, src;
2973 	__u16 destp, srcp;
2974 
2975 	dest  = tw->tw_daddr;
2976 	src   = tw->tw_rcv_saddr;
2977 	destp = ntohs(tw->tw_dport);
2978 	srcp  = ntohs(tw->tw_sport);
2979 
2980 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2981 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2982 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2983 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2984 		refcount_read(&tw->tw_refcnt), tw);
2985 }
2986 
2987 #define TMPSZ 150
2988 
2989 static int tcp4_seq_show(struct seq_file *seq, void *v)
2990 {
2991 	struct tcp_iter_state *st;
2992 	struct sock *sk = v;
2993 
2994 	seq_setwidth(seq, TMPSZ - 1);
2995 	if (v == SEQ_START_TOKEN) {
2996 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2997 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2998 			   "inode");
2999 		goto out;
3000 	}
3001 	st = seq->private;
3002 
3003 	if (sk->sk_state == TCP_TIME_WAIT)
3004 		get_timewait4_sock(v, seq, st->num);
3005 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
3006 		get_openreq4(v, seq, st->num);
3007 	else
3008 		get_tcp4_sock(v, seq, st->num);
3009 out:
3010 	seq_pad(seq, '\n');
3011 	return 0;
3012 }
3013 
3014 #ifdef CONFIG_BPF_SYSCALL
3015 union bpf_tcp_iter_batch_item {
3016 	struct sock *sk;
3017 	__u64 cookie;
3018 };
3019 
3020 struct bpf_tcp_iter_state {
3021 	struct tcp_iter_state state;
3022 	unsigned int cur_sk;
3023 	unsigned int end_sk;
3024 	unsigned int max_sk;
3025 	union bpf_tcp_iter_batch_item *batch;
3026 };
3027 
3028 struct bpf_iter__tcp {
3029 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3030 	__bpf_md_ptr(struct sock_common *, sk_common);
3031 	uid_t uid __aligned(8);
3032 };
3033 
3034 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3035 			     struct sock_common *sk_common, uid_t uid)
3036 {
3037 	struct bpf_iter__tcp ctx;
3038 
3039 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3040 	ctx.meta = meta;
3041 	ctx.sk_common = sk_common;
3042 	ctx.uid = uid;
3043 	return bpf_iter_run_prog(prog, &ctx);
3044 }
3045 
3046 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3047 {
3048 	union bpf_tcp_iter_batch_item *item;
3049 	unsigned int cur_sk = iter->cur_sk;
3050 	__u64 cookie;
3051 
3052 	/* Remember the cookies of the sockets we haven't seen yet, so we can
3053 	 * pick up where we left off next time around.
3054 	 */
3055 	while (cur_sk < iter->end_sk) {
3056 		item = &iter->batch[cur_sk++];
3057 		cookie = sock_gen_cookie(item->sk);
3058 		sock_gen_put(item->sk);
3059 		item->cookie = cookie;
3060 	}
3061 }
3062 
3063 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3064 				      unsigned int new_batch_sz, gfp_t flags)
3065 {
3066 	union bpf_tcp_iter_batch_item *new_batch;
3067 
3068 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3069 			     flags | __GFP_NOWARN);
3070 	if (!new_batch)
3071 		return -ENOMEM;
3072 
3073 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3074 	kvfree(iter->batch);
3075 	iter->batch = new_batch;
3076 	iter->max_sk = new_batch_sz;
3077 
3078 	return 0;
3079 }
3080 
3081 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3082 					       union bpf_tcp_iter_batch_item *cookies,
3083 					       int n_cookies)
3084 {
3085 	struct hlist_nulls_node *node;
3086 	struct sock *sk;
3087 	int i;
3088 
3089 	for (i = 0; i < n_cookies; i++) {
3090 		sk = first_sk;
3091 		sk_nulls_for_each_from(sk, node)
3092 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3093 				return sk;
3094 	}
3095 
3096 	return NULL;
3097 }
3098 
3099 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3100 {
3101 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3102 	struct bpf_tcp_iter_state *iter = seq->private;
3103 	struct tcp_iter_state *st = &iter->state;
3104 	unsigned int find_cookie = iter->cur_sk;
3105 	unsigned int end_cookie = iter->end_sk;
3106 	int resume_bucket = st->bucket;
3107 	struct sock *sk;
3108 
3109 	if (end_cookie && find_cookie == end_cookie)
3110 		++st->bucket;
3111 
3112 	sk = listening_get_first(seq);
3113 	iter->cur_sk = 0;
3114 	iter->end_sk = 0;
3115 
3116 	if (sk && st->bucket == resume_bucket && end_cookie) {
3117 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3118 						end_cookie - find_cookie);
3119 		if (!sk) {
3120 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3121 			++st->bucket;
3122 			sk = listening_get_first(seq);
3123 		}
3124 	}
3125 
3126 	return sk;
3127 }
3128 
3129 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3130 {
3131 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3132 	struct bpf_tcp_iter_state *iter = seq->private;
3133 	struct tcp_iter_state *st = &iter->state;
3134 	unsigned int find_cookie = iter->cur_sk;
3135 	unsigned int end_cookie = iter->end_sk;
3136 	int resume_bucket = st->bucket;
3137 	struct sock *sk;
3138 
3139 	if (end_cookie && find_cookie == end_cookie)
3140 		++st->bucket;
3141 
3142 	sk = established_get_first(seq);
3143 	iter->cur_sk = 0;
3144 	iter->end_sk = 0;
3145 
3146 	if (sk && st->bucket == resume_bucket && end_cookie) {
3147 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3148 						end_cookie - find_cookie);
3149 		if (!sk) {
3150 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3151 			++st->bucket;
3152 			sk = established_get_first(seq);
3153 		}
3154 	}
3155 
3156 	return sk;
3157 }
3158 
3159 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3160 {
3161 	struct bpf_tcp_iter_state *iter = seq->private;
3162 	struct tcp_iter_state *st = &iter->state;
3163 	struct sock *sk = NULL;
3164 
3165 	switch (st->state) {
3166 	case TCP_SEQ_STATE_LISTENING:
3167 		sk = bpf_iter_tcp_resume_listening(seq);
3168 		if (sk)
3169 			break;
3170 		st->bucket = 0;
3171 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3172 		fallthrough;
3173 	case TCP_SEQ_STATE_ESTABLISHED:
3174 		sk = bpf_iter_tcp_resume_established(seq);
3175 		break;
3176 	}
3177 
3178 	return sk;
3179 }
3180 
3181 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3182 						 struct sock **start_sk)
3183 {
3184 	struct bpf_tcp_iter_state *iter = seq->private;
3185 	struct hlist_nulls_node *node;
3186 	unsigned int expected = 1;
3187 	struct sock *sk;
3188 
3189 	sock_hold(*start_sk);
3190 	iter->batch[iter->end_sk++].sk = *start_sk;
3191 
3192 	sk = sk_nulls_next(*start_sk);
3193 	*start_sk = NULL;
3194 	sk_nulls_for_each_from(sk, node) {
3195 		if (seq_sk_match(seq, sk)) {
3196 			if (iter->end_sk < iter->max_sk) {
3197 				sock_hold(sk);
3198 				iter->batch[iter->end_sk++].sk = sk;
3199 			} else if (!*start_sk) {
3200 				/* Remember where we left off. */
3201 				*start_sk = sk;
3202 			}
3203 			expected++;
3204 		}
3205 	}
3206 
3207 	return expected;
3208 }
3209 
3210 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3211 						   struct sock **start_sk)
3212 {
3213 	struct bpf_tcp_iter_state *iter = seq->private;
3214 	struct hlist_nulls_node *node;
3215 	unsigned int expected = 1;
3216 	struct sock *sk;
3217 
3218 	sock_hold(*start_sk);
3219 	iter->batch[iter->end_sk++].sk = *start_sk;
3220 
3221 	sk = sk_nulls_next(*start_sk);
3222 	*start_sk = NULL;
3223 	sk_nulls_for_each_from(sk, node) {
3224 		if (seq_sk_match(seq, sk)) {
3225 			if (iter->end_sk < iter->max_sk) {
3226 				sock_hold(sk);
3227 				iter->batch[iter->end_sk++].sk = sk;
3228 			} else if (!*start_sk) {
3229 				/* Remember where we left off. */
3230 				*start_sk = sk;
3231 			}
3232 			expected++;
3233 		}
3234 	}
3235 
3236 	return expected;
3237 }
3238 
3239 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3240 					struct sock **start_sk)
3241 {
3242 	struct bpf_tcp_iter_state *iter = seq->private;
3243 	struct tcp_iter_state *st = &iter->state;
3244 
3245 	if (st->state == TCP_SEQ_STATE_LISTENING)
3246 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3247 	else
3248 		return bpf_iter_tcp_established_batch(seq, start_sk);
3249 }
3250 
3251 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3252 {
3253 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3254 	struct bpf_tcp_iter_state *iter = seq->private;
3255 	struct tcp_iter_state *st = &iter->state;
3256 
3257 	if (st->state == TCP_SEQ_STATE_LISTENING)
3258 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3259 	else
3260 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3261 }
3262 
3263 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3264 {
3265 	struct bpf_tcp_iter_state *iter = seq->private;
3266 	unsigned int expected;
3267 	struct sock *sk;
3268 	int err;
3269 
3270 	sk = bpf_iter_tcp_resume(seq);
3271 	if (!sk)
3272 		return NULL; /* Done */
3273 
3274 	expected = bpf_iter_fill_batch(seq, &sk);
3275 	if (likely(iter->end_sk == expected))
3276 		goto done;
3277 
3278 	/* Batch size was too small. */
3279 	bpf_iter_tcp_unlock_bucket(seq);
3280 	bpf_iter_tcp_put_batch(iter);
3281 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3282 					 GFP_USER);
3283 	if (err)
3284 		return ERR_PTR(err);
3285 
3286 	sk = bpf_iter_tcp_resume(seq);
3287 	if (!sk)
3288 		return NULL; /* Done */
3289 
3290 	expected = bpf_iter_fill_batch(seq, &sk);
3291 	if (likely(iter->end_sk == expected))
3292 		goto done;
3293 
3294 	/* Batch size was still too small. Hold onto the lock while we try
3295 	 * again with a larger batch to make sure the current bucket's size
3296 	 * does not change in the meantime.
3297 	 */
3298 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3299 	if (err) {
3300 		bpf_iter_tcp_unlock_bucket(seq);
3301 		return ERR_PTR(err);
3302 	}
3303 
3304 	expected = bpf_iter_fill_batch(seq, &sk);
3305 	WARN_ON_ONCE(iter->end_sk != expected);
3306 done:
3307 	bpf_iter_tcp_unlock_bucket(seq);
3308 	return iter->batch[0].sk;
3309 }
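/* Note on the batching strategy above: the first undersized attempt drops
 * the bucket lock, releases the batch and retries with a batch grown to
 * 1.5x the expected size (GFP_USER); if that is still too small, the bucket
 * lock is kept across a GFP_NOWAIT reallocation so the bucket cannot change
 * size again before the final fill.
 */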
3310 
3311 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3312 {
3313 	/* bpf iter does not support lseek, so it always
3314 	 * continues from where it was stop()-ped.
3315 	 */
3316 	if (*pos)
3317 		return bpf_iter_tcp_batch(seq);
3318 
3319 	return SEQ_START_TOKEN;
3320 }
3321 
3322 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3323 {
3324 	struct bpf_tcp_iter_state *iter = seq->private;
3325 	struct tcp_iter_state *st = &iter->state;
3326 	struct sock *sk;
3327 
3328 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3329 	 * been through seq_show(), so advance to the next sk in
3330 	 * the batch.
3331 	 */
3332 	if (iter->cur_sk < iter->end_sk) {
3333 		/* Keeping st->num consistent in tcp_iter_state.
3334 		 * bpf_iter_tcp does not use st->num.
3335 		 * meta.seq_num is used instead.
3336 		 */
3337 		st->num++;
3338 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3339 	}
3340 
3341 	if (iter->cur_sk < iter->end_sk)
3342 		sk = iter->batch[iter->cur_sk].sk;
3343 	else
3344 		sk = bpf_iter_tcp_batch(seq);
3345 
3346 	++*pos;
3347 	/* Keeping st->last_pos consistent in tcp_iter_state.
3348 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3349 	 */
3350 	st->last_pos = *pos;
3351 	return sk;
3352 }
3353 
3354 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3355 {
3356 	struct bpf_iter_meta meta;
3357 	struct bpf_prog *prog;
3358 	struct sock *sk = v;
3359 	uid_t uid;
3360 	int ret;
3361 
3362 	if (v == SEQ_START_TOKEN)
3363 		return 0;
3364 
3365 	if (sk_fullsock(sk))
3366 		lock_sock(sk);
3367 
3368 	if (unlikely(sk_unhashed(sk))) {
3369 		ret = SEQ_SKIP;
3370 		goto unlock;
3371 	}
3372 
3373 	if (sk->sk_state == TCP_TIME_WAIT) {
3374 		uid = 0;
3375 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3376 		const struct request_sock *req = v;
3377 
3378 		uid = from_kuid_munged(seq_user_ns(seq),
3379 				       sk_uid(req->rsk_listener));
3380 	} else {
3381 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3382 	}
3383 
3384 	meta.seq = seq;
3385 	prog = bpf_iter_get_info(&meta, false);
3386 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3387 
3388 unlock:
3389 	if (sk_fullsock(sk))
3390 		release_sock(sk);
3391 	return ret;
3392 
3393 }
3394 
3395 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3396 {
3397 	struct bpf_tcp_iter_state *iter = seq->private;
3398 	struct bpf_iter_meta meta;
3399 	struct bpf_prog *prog;
3400 
3401 	if (!v) {
3402 		meta.seq = seq;
3403 		prog = bpf_iter_get_info(&meta, true);
3404 		if (prog)
3405 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3406 	}
3407 
3408 	if (iter->cur_sk < iter->end_sk)
3409 		bpf_iter_tcp_put_batch(iter);
3410 }
3411 
3412 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3413 	.show		= bpf_iter_tcp_seq_show,
3414 	.start		= bpf_iter_tcp_seq_start,
3415 	.next		= bpf_iter_tcp_seq_next,
3416 	.stop		= bpf_iter_tcp_seq_stop,
3417 };
3418 #endif
3419 static unsigned short seq_file_family(const struct seq_file *seq)
3420 {
3421 	const struct tcp_seq_afinfo *afinfo;
3422 
3423 #ifdef CONFIG_BPF_SYSCALL
3424 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3425 	if (seq->op == &bpf_iter_tcp_seq_ops)
3426 		return AF_UNSPEC;
3427 #endif
3428 
3429 	/* Iterated from proc fs */
3430 	afinfo = pde_data(file_inode(seq->file));
3431 	return afinfo->family;
3432 }
3433 
3434 static const struct seq_operations tcp4_seq_ops = {
3435 	.show		= tcp4_seq_show,
3436 	.start		= tcp_seq_start,
3437 	.next		= tcp_seq_next,
3438 	.stop		= tcp_seq_stop,
3439 };
3440 
3441 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3442 	.family		= AF_INET,
3443 };
3444 
3445 static int __net_init tcp4_proc_init_net(struct net *net)
3446 {
3447 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3448 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3449 		return -ENOMEM;
3450 	return 0;
3451 }
3452 
3453 static void __net_exit tcp4_proc_exit_net(struct net *net)
3454 {
3455 	remove_proc_entry("tcp", net->proc_net);
3456 }
3457 
3458 static struct pernet_operations tcp4_net_ops = {
3459 	.init = tcp4_proc_init_net,
3460 	.exit = tcp4_proc_exit_net,
3461 };
3462 
3463 int __init tcp4_proc_init(void)
3464 {
3465 	return register_pernet_subsys(&tcp4_net_ops);
3466 }
3467 
3468 void tcp4_proc_exit(void)
3469 {
3470 	unregister_pernet_subsys(&tcp4_net_ops);
3471 }
3472 #endif /* CONFIG_PROC_FS */
3473 
3474 /* @wake is one when sk_stream_write_space() calls us.
3475  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3476  * This mimics the strategy used in sock_def_write_space().
3477  */
3478 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3479 {
3480 	const struct tcp_sock *tp = tcp_sk(sk);
3481 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3482 			    READ_ONCE(tp->snd_nxt);
3483 
3484 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3485 }
3486 EXPORT_SYMBOL(tcp_stream_memory_free);
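/* Worked example with an illustrative tcp_notsent_lowat of 128 KB: the plain
 * check (wake == 0) succeeds while less than 128 KB is queued but unsent;
 * the write-space wakeup path (wake == 1) doubles notsent_bytes before the
 * comparison, so EPOLLOUT is only signalled once the backlog drops below
 * 64 KB.
 */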
3487 
3488 struct proto tcp_prot = {
3489 	.name			= "TCP",
3490 	.owner			= THIS_MODULE,
3491 	.close			= tcp_close,
3492 	.pre_connect		= tcp_v4_pre_connect,
3493 	.connect		= tcp_v4_connect,
3494 	.disconnect		= tcp_disconnect,
3495 	.accept			= inet_csk_accept,
3496 	.ioctl			= tcp_ioctl,
3497 	.init			= tcp_v4_init_sock,
3498 	.destroy		= tcp_v4_destroy_sock,
3499 	.shutdown		= tcp_shutdown,
3500 	.setsockopt		= tcp_setsockopt,
3501 	.getsockopt		= tcp_getsockopt,
3502 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3503 	.keepalive		= tcp_set_keepalive,
3504 	.recvmsg		= tcp_recvmsg,
3505 	.sendmsg		= tcp_sendmsg,
3506 	.splice_eof		= tcp_splice_eof,
3507 	.backlog_rcv		= tcp_v4_do_rcv,
3508 	.release_cb		= tcp_release_cb,
3509 	.hash			= inet_hash,
3510 	.unhash			= inet_unhash,
3511 	.get_port		= inet_csk_get_port,
3512 	.put_port		= inet_put_port,
3513 #ifdef CONFIG_BPF_SYSCALL
3514 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3515 #endif
3516 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3517 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3518 	.stream_memory_free	= tcp_stream_memory_free,
3519 	.sockets_allocated	= &tcp_sockets_allocated,
3520 
3521 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3522 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3523 
3524 	.memory_pressure	= &tcp_memory_pressure,
3525 	.sysctl_mem		= sysctl_tcp_mem,
3526 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3527 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3528 	.max_header		= MAX_TCP_HEADER,
3529 	.obj_size		= sizeof(struct tcp_sock),
3530 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3531 	.twsk_prot		= &tcp_timewait_sock_ops,
3532 	.rsk_prot		= &tcp_request_sock_ops,
3533 	.h.hashinfo		= NULL,
3534 	.no_autobind		= true,
3535 	.diag_destroy		= tcp_abort,
3536 };
3537 EXPORT_SYMBOL(tcp_prot);
3538 
3539 static void __net_exit tcp_sk_exit(struct net *net)
3540 {
3541 	if (net->ipv4.tcp_congestion_control)
3542 		bpf_module_put(net->ipv4.tcp_congestion_control,
3543 			       net->ipv4.tcp_congestion_control->owner);
3544 }
3545 
3546 static void __net_init tcp_set_hashinfo(struct net *net)
3547 {
3548 	struct inet_hashinfo *hinfo;
3549 	unsigned int ehash_entries;
3550 	struct net *old_net;
3551 
3552 	if (net_eq(net, &init_net))
3553 		goto fallback;
3554 
3555 	old_net = current->nsproxy->net_ns;
3556 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3557 	if (!ehash_entries)
3558 		goto fallback;
3559 
3560 	ehash_entries = roundup_pow_of_two(ehash_entries);
3561 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3562 	if (!hinfo) {
3563 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3564 			"for a netns, fallback to the global one\n",
3565 			ehash_entries);
3566 fallback:
3567 		hinfo = &tcp_hashinfo;
3568 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3569 	}
3570 
3571 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3572 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3573 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3574 }
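/* Illustrative sizing example: a parent netns value of 1000 for
 * sysctl_tcp_child_ehash_entries is rounded up to 1024, which yields
 * sysctl_max_tw_buckets = 512 and sysctl_max_syn_backlog = max(128, 8) = 128
 * for the child netns.
 */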
3575 
3576 static int __net_init tcp_sk_init(struct net *net)
3577 {
3578 	net->ipv4.sysctl_tcp_ecn = 2;
3579 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3580 
3581 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3582 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3583 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3584 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3585 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3586 
3587 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3588 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3589 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3590 
3591 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3592 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3593 	net->ipv4.sysctl_tcp_syncookies = 1;
3594 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3595 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3596 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3597 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3598 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3599 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3600 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3601 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3602 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3603 
3604 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3605 	tcp_set_hashinfo(net);
3606 
3607 	net->ipv4.sysctl_tcp_sack = 1;
3608 	net->ipv4.sysctl_tcp_window_scaling = 1;
3609 	net->ipv4.sysctl_tcp_timestamps = 1;
3610 	net->ipv4.sysctl_tcp_early_retrans = 3;
3611 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3612 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3613 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3614 	net->ipv4.sysctl_tcp_max_reordering = 300;
3615 	net->ipv4.sysctl_tcp_dsack = 1;
3616 	net->ipv4.sysctl_tcp_app_win = 31;
3617 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3618 	net->ipv4.sysctl_tcp_frto = 2;
3619 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3620 	/* This limits the percentage of the congestion window which we
3621 	 * will allow a single TSO frame to consume.  Building TSO frames
3622 	 * which are too large can cause TCP streams to be bursty.
3623 	 */
3624 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3625 	/* Default TSQ limit of 4 MB */
3626 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3627 
3628 	/* RFC 5961 challenge ACK rate limiting, per netns; disabled by default (INT_MAX = no limit). */
3629 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3630 
3631 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3632 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3633 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3634 	net->ipv4.sysctl_tcp_autocorking = 1;
3635 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3636 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3637 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3638 	if (net != &init_net) {
3639 		memcpy(net->ipv4.sysctl_tcp_rmem,
3640 		       init_net.ipv4.sysctl_tcp_rmem,
3641 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3642 		memcpy(net->ipv4.sysctl_tcp_wmem,
3643 		       init_net.ipv4.sysctl_tcp_wmem,
3644 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3645 	}
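	/* SACK compression defaults: hold a pure SACK-carrying ACK for up to
	 * ~1 ms (with 100 us of slack) and coalesce at most 44 of them.
	 */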
3646 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3647 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3648 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3649 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
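	/* TCP Fast Open: client support is on by default (TFO_CLIENT_ENABLE,
	 * i.e. sysctl tcp_fastopen = 1); the server side must be enabled
	 * explicitly via the sysctl or the TCP_FASTOPEN socket option.
	 */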
3650 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3651 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3652 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3653 
3654 	/* Set default values for PLB */
3655 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3656 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3657 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3658 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3659 	/* Default congestion threshold for PLB to mark a round is 50% */
3660 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3661 
3662 	/* Inherit init_net's congestion control when possible; Reno is always built in as the fallback. */
3663 	if (!net_eq(net, &init_net) &&
3664 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3665 			       init_net.ipv4.tcp_congestion_control->owner))
3666 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3667 	else
3668 		net->ipv4.tcp_congestion_control = &tcp_reno;
3669 
3670 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3671 	net->ipv4.sysctl_tcp_shrink_window = 0;
3672 
3673 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
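	/* Default RTO bounds: TCP_RTO_MIN (HZ / 5, i.e. 200 ms) expressed in
	 * microseconds, and TCP_RTO_MAX_SEC (120 s) expressed in milliseconds.
	 */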
3674 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3675 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3676 
3677 	return 0;
3678 }
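/* The defaults above are all per netns and can be inspected or overridden
 * through the usual sysctl interface inside that namespace, e.g.
 * (illustrative shell usage):
 *
 *	ip netns add blue
 *	ip netns exec blue sysctl net.ipv4.tcp_syncookies
 *	ip netns exec blue sysctl -w net.ipv4.tcp_keepalive_time=600
 */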
3679 
3680 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3681 {
3682 	struct net *net;
3683 
3684 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3685 	 * and from the failed setup_net() error-unwinding path are serialized.
3686 	 *
3687 	 * Since tcp_twsk_purge() handles twsks in any dead netns, not just those
3688 	 * on net_exit_list, the thread that dismantles a particular twsk must
3689 	 * do so without another thread progressing to refcount_dec_and_test()
3690 	 * of tcp_death_row.tw_refcount.
3691 	 */
3692 	mutex_lock(&tcp_exit_batch_mutex);
3693 
3694 	tcp_twsk_purge(net_exit_list);
3695 
3696 	list_for_each_entry(net, net_exit_list, exit_list) {
3697 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3698 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3699 		tcp_fastopen_ctx_destroy(net);
3700 	}
3701 
3702 	mutex_unlock(&tcp_exit_batch_mutex);
3703 }
3704 
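/* Hook TCP into netns lifetime: .init runs for every namespace as it is
 * created (including init_net at boot), .exit runs per namespace on teardown,
 * and .exit_batch runs once per cleanup batch after the per-netns .exit calls.
 */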
3705 static struct pernet_operations __net_initdata tcp_sk_ops = {
3706 	.init		= tcp_sk_init,
3707 	.exit		= tcp_sk_exit,
3708 	.exit_batch	= tcp_sk_exit_batch,
3709 };
3710 
3711 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3712 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3713 		     struct sock_common *sk_common, uid_t uid)
3714 
3715 #define INIT_BATCH_SZ 16
3716 
3717 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3718 {
3719 	struct bpf_tcp_iter_state *iter = priv_data;
3720 	int err;
3721 
3722 	err = bpf_iter_init_seq_net(priv_data, aux);
3723 	if (err)
3724 		return err;
3725 
3726 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3727 	if (err) {
3728 		bpf_iter_fini_seq_net(priv_data);
3729 		return err;
3730 	}
3731 
3732 	return 0;
3733 }
3734 
3735 static void bpf_iter_fini_tcp(void *priv_data)
3736 {
3737 	struct bpf_tcp_iter_state *iter = priv_data;
3738 
3739 	bpf_iter_fini_seq_net(priv_data);
3740 	kvfree(iter->batch);
3741 }
3742 
3743 static const struct bpf_iter_seq_info tcp_seq_info = {
3744 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3745 	.init_seq_private	= bpf_iter_init_tcp,
3746 	.fini_seq_private	= bpf_iter_fini_tcp,
3747 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3748 };
3749 
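/* Extra helpers allowed to BPF TCP iterator programs: an "iter/tcp" program
 * may call bpf_setsockopt()/bpf_getsockopt() on the sockets it visits, e.g.
 * to adjust per-socket options across all existing connections.
 */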
3750 static const struct bpf_func_proto *
3751 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3752 			    const struct bpf_prog *prog)
3753 {
3754 	switch (func_id) {
3755 	case BPF_FUNC_setsockopt:
3756 		return &bpf_sk_setsockopt_proto;
3757 	case BPF_FUNC_getsockopt:
3758 		return &bpf_sk_getsockopt_proto;
3759 	default:
3760 		return NULL;
3761 	}
3762 }
3763 
3764 static struct bpf_iter_reg tcp_reg_info = {
3765 	.target			= "tcp",
3766 	.ctx_arg_info_size	= 1,
3767 	.ctx_arg_info		= {
3768 		{ offsetof(struct bpf_iter__tcp, sk_common),
3769 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3770 	},
3771 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3772 	.seq_info		= &tcp_seq_info,
3773 };
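/* Once registered, the "tcp" iterator target can be driven from userspace.
 * A minimal sketch, assuming libbpf and a skeleton with a program named
 * dump_tcp (names are illustrative, not defined here):
 *
 *	// BPF side
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (skc)
 *			BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d\n",
 *				       skc->skc_family);
 *		return 0;
 *	}
 *
 *	// userspace side
 *	link = bpf_program__attach_iter(skel->progs.dump_tcp, NULL);
 *	iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		write(STDOUT_FILENO, buf, n);
 */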
3774 
3775 static void __init bpf_iter_register(void)
3776 {
3777 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3778 	if (bpf_iter_reg_target(&tcp_reg_info))
3779 		pr_warn("Warning: could not register bpf iterator tcp\n");
3780 }
3781 
3782 #endif
3783 
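/* Boot-time setup: create one kernel "control" TCP socket per possible CPU
 * (used by paths such as tcp_v4_send_reset()/tcp_v4_send_ack() to transmit
 * RSTs and ACKs outside the normal socket context), then register the
 * per-netns init/exit hooks above.
 */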
3784 void __init tcp_v4_init(void)
3785 {
3786 	int cpu, res;
3787 
3788 	for_each_possible_cpu(cpu) {
3789 		struct sock *sk;
3790 
3791 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3792 					   IPPROTO_TCP, &init_net);
3793 		if (res)
3794 			panic("Failed to create the TCP control socket.\n");
3795 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3796 
3797 		/* Enforce IP_DF and IPID == 0 for RSTs and ACKs sent
3798 		 * in SYN-RECV and TIME-WAIT states.
3799 		 */
3800 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3801 
3802 		sk->sk_clockid = CLOCK_MONOTONIC;
3803 
3804 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3805 	}
3806 	if (register_pernet_subsys(&tcp_sk_ops))
3807 		panic("Failed to create the TCP control socket.\n");
3808 
3809 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3810 	bpf_iter_register();
3811 #endif
3812 }
3813