xref: /linux/net/ipv4/tcp_ipv4.c (revision 3fd6c59042dbba50391e30862beac979491145fe)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 #include <linux/skbuff_ref.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 #include <trace/events/tcp.h>
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
99 };
100 
101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
102 
103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104 {
105 	return secure_tcp_seq(ip_hdr(skb)->daddr,
106 			      ip_hdr(skb)->saddr,
107 			      tcp_hdr(skb)->dest,
108 			      tcp_hdr(skb)->source);
109 }
110 
111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112 {
113 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114 }
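
/* Both helpers derive their values from the addresses and ports of the
 * incoming segment via the keyed hashes in net/core/secure_seq.c: the
 * timestamp offset is stable per address pair, while the initial sequence
 * number also includes a clock component, making both hard to predict
 * off-path.
 */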
115 
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
120 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 	struct tcp_sock *tp = tcp_sk(sk);
122 	int ts_recent_stamp;
123 
124 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
125 		reuse = 0;
126 
127 	if (reuse == 2) {
128 		/* Still does not detect *everything* that goes through
129 		 * lo, since we require a loopback src or dst address
130 		 * or direct binding to 'lo' interface.
131 		 */
132 		bool loopback = false;
133 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
134 			loopback = true;
135 #if IS_ENABLED(CONFIG_IPV6)
136 		if (tw->tw_family == AF_INET6) {
137 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
138 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
139 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
140 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
141 				loopback = true;
142 		} else
143 #endif
144 		{
145 			if (ipv4_is_loopback(tw->tw_daddr) ||
146 			    ipv4_is_loopback(tw->tw_rcv_saddr))
147 				loopback = true;
148 		}
149 		if (!loopback)
150 			reuse = 0;
151 	}
152 
153 	/* With PAWS, it is safe from the viewpoint
154 	   of data integrity. Even without PAWS it is safe provided sequence
155 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
156 
157 	   Actually, the idea is close to VJ's: the timestamp cache is
158 	   held not per host but per port pair, and the TW bucket is used
159 	   as the state holder.
160 
161 	   If the TW bucket has already been destroyed we fall back to VJ's
162 	   scheme and use the initial timestamp retrieved from the peer table.
163 	 */
164 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
165 	if (ts_recent_stamp &&
166 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
167 					    ts_recent_stamp)))) {
168 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
169 		 * and releasing the bucket lock.
170 		 */
171 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
172 			return 0;
173 
174 		/* In case of repair and re-using TIME-WAIT sockets we still
175 		 * want to be sure that it is safe as above but honor the
176 		 * sequence numbers and time stamps set as part of the repair
177 		 * process.
178 		 *
179 		 * Without this check re-using a TIME-WAIT socket with TCP
180 		 * repair would accumulate a -1 on the repair assigned
181 		 * sequence number. The first time it is reused the sequence
182 		 * is -1, the second time -2, etc. This fixes that issue
183 		 * without appearing to create any others.
184 		 */
185 		if (likely(!tp->repair)) {
186 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
187 
188 			if (!seq)
189 				seq = 1;
190 			WRITE_ONCE(tp->write_seq, seq);
191 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
192 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
193 		}
194 
195 		return 1;
196 	}
197 
198 	return 0;
199 }
200 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
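
/* Called from connect()'s port/hash selection when the chosen four-tuple
 * collides with a TIME-WAIT entry; returning 1 allows that TIME-WAIT socket
 * to be replaced.  The net.ipv4.sysctl_tcp_tw_reuse values read above follow
 * the documented sysctl semantics: 0 never reuses, 1 allows reuse for
 * outgoing connections once the timestamp check passes, and 2 restricts that
 * to loopback traffic (the block guarded by "reuse == 2").
 */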
201 
202 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
203 			      int addr_len)
204 {
205 	/* This check is replicated from tcp_v4_connect() and intended to
206 	 * prevent the BPF program called below from accessing bytes that are
207 	 * out of the bounds specified by the user in addr_len.
208 	 */
209 	if (addr_len < sizeof(struct sockaddr_in))
210 		return -EINVAL;
211 
212 	sock_owned_by_me(sk);
213 
214 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
215 }
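
/* The BPF_CGROUP_RUN_PROG_INET4_CONNECT() hook above runs any BPF programs
 * attached to the cgroup at the BPF_CGROUP_INET4_CONNECT attach point; such
 * programs may rewrite the destination address/port before tcp_v4_connect()
 * sees it, which is why the addr_len bound is validated here first.
 */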
216 
217 /* This will initiate an outgoing connection. */
218 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
219 {
220 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
221 	struct inet_timewait_death_row *tcp_death_row;
222 	struct inet_sock *inet = inet_sk(sk);
223 	struct tcp_sock *tp = tcp_sk(sk);
224 	struct ip_options_rcu *inet_opt;
225 	struct net *net = sock_net(sk);
226 	__be16 orig_sport, orig_dport;
227 	__be32 daddr, nexthop;
228 	struct flowi4 *fl4;
229 	struct rtable *rt;
230 	int err;
231 
232 	if (addr_len < sizeof(struct sockaddr_in))
233 		return -EINVAL;
234 
235 	if (usin->sin_family != AF_INET)
236 		return -EAFNOSUPPORT;
237 
238 	nexthop = daddr = usin->sin_addr.s_addr;
239 	inet_opt = rcu_dereference_protected(inet->inet_opt,
240 					     lockdep_sock_is_held(sk));
241 	if (inet_opt && inet_opt->opt.srr) {
242 		if (!daddr)
243 			return -EINVAL;
244 		nexthop = inet_opt->opt.faddr;
245 	}
246 
247 	orig_sport = inet->inet_sport;
248 	orig_dport = usin->sin_port;
249 	fl4 = &inet->cork.fl.u.ip4;
250 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
251 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
252 			      orig_dport, sk);
253 	if (IS_ERR(rt)) {
254 		err = PTR_ERR(rt);
255 		if (err == -ENETUNREACH)
256 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
257 		return err;
258 	}
259 
260 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
261 		ip_rt_put(rt);
262 		return -ENETUNREACH;
263 	}
264 
265 	if (!inet_opt || !inet_opt->opt.srr)
266 		daddr = fl4->daddr;
267 
268 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
269 
270 	if (!inet->inet_saddr) {
271 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
272 		if (err) {
273 			ip_rt_put(rt);
274 			return err;
275 		}
276 	} else {
277 		sk_rcv_saddr_set(sk, inet->inet_saddr);
278 	}
279 
280 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
281 		/* Reset inherited state */
282 		tp->rx_opt.ts_recent	   = 0;
283 		tp->rx_opt.ts_recent_stamp = 0;
284 		if (likely(!tp->repair))
285 			WRITE_ONCE(tp->write_seq, 0);
286 	}
287 
288 	inet->inet_dport = usin->sin_port;
289 	sk_daddr_set(sk, daddr);
290 
291 	inet_csk(sk)->icsk_ext_hdr_len = 0;
292 	if (inet_opt)
293 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
294 
295 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
296 
297 	/* Socket identity is still unknown (sport may be zero).
298 	 * However we set the state to SYN-SENT and, without releasing the
299 	 * socket lock, select a source port, enter ourselves into the hash
300 	 * tables and complete initialization after this.
301 	 */
302 	tcp_set_state(sk, TCP_SYN_SENT);
303 	err = inet_hash_connect(tcp_death_row, sk);
304 	if (err)
305 		goto failure;
306 
307 	sk_set_txhash(sk);
308 
309 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
310 			       inet->inet_sport, inet->inet_dport, sk);
311 	if (IS_ERR(rt)) {
312 		err = PTR_ERR(rt);
313 		rt = NULL;
314 		goto failure;
315 	}
316 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
317 	/* OK, now commit destination to socket.  */
318 	sk->sk_gso_type = SKB_GSO_TCPV4;
319 	sk_setup_caps(sk, &rt->dst);
320 	rt = NULL;
321 
322 	if (likely(!tp->repair)) {
323 		if (!tp->write_seq)
324 			WRITE_ONCE(tp->write_seq,
325 				   secure_tcp_seq(inet->inet_saddr,
326 						  inet->inet_daddr,
327 						  inet->inet_sport,
328 						  usin->sin_port));
329 		WRITE_ONCE(tp->tsoffset,
330 			   secure_tcp_ts_off(net, inet->inet_saddr,
331 					     inet->inet_daddr));
332 	}
333 
334 	atomic_set(&inet->inet_id, get_random_u16());
335 
336 	if (tcp_fastopen_defer_connect(sk, &err))
337 		return err;
338 	if (err)
339 		goto failure;
340 
341 	err = tcp_connect(sk);
342 
343 	if (err)
344 		goto failure;
345 
346 	return 0;
347 
348 failure:
349 	/*
350 	 * This unhashes the socket and releases the local port,
351 	 * if necessary.
352 	 */
353 	tcp_set_state(sk, TCP_CLOSE);
354 	inet_bhash2_reset_saddr(sk);
355 	ip_rt_put(rt);
356 	sk->sk_route_caps = 0;
357 	inet->inet_dport = 0;
358 	return err;
359 }
360 EXPORT_SYMBOL(tcp_v4_connect);
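
/* For reference, the usual way this function is reached is an ordinary
 * connect() on an AF_INET stream socket, roughly (illustrative userspace
 * sketch only):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { htonl(INADDR_LOOPBACK) },
 *	};
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * which enters the kernel via __sys_connect() and inet_stream_connect()
 * before dispatching to the protocol's ->connect() hook.
 */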
361 
362 /*
363  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
364  * It can be called through tcp_release_cb() if socket was owned by user
365  * at the time tcp_v4_err() was called to handle ICMP message.
366  */
367 void tcp_v4_mtu_reduced(struct sock *sk)
368 {
369 	struct inet_sock *inet = inet_sk(sk);
370 	struct dst_entry *dst;
371 	u32 mtu;
372 
373 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
374 		return;
375 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
376 	dst = inet_csk_update_pmtu(sk, mtu);
377 	if (!dst)
378 		return;
379 
380 	/* Something is about to go wrong... Remember the soft error
381 	 * in case this connection is not able to recover.
382 	 */
383 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
384 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
385 
386 	mtu = dst_mtu(dst);
387 
388 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
389 	    ip_sk_accept_pmtu(sk) &&
390 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
391 		tcp_sync_mss(sk, mtu);
392 
393 		/* Resend the TCP packet because it's
394 		 * clear that the old packet has been
395 		 * dropped. This is the new "fast" path mtu
396 		 * discovery.
397 		 */
398 		tcp_simple_retransmit(sk);
399 	} /* else let the usual retransmit timer handle it */
400 }
401 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
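
/* When the socket is owned by the user at the time the ICMP arrives,
 * tcp_v4_err() defers this work by setting TCP_MTU_REDUCED_DEFERRED and
 * taking a socket reference; tcp_release_cb() then calls back in here once
 * the lock is released, as the header comment above describes.
 */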
402 
403 static void do_redirect(struct sk_buff *skb, struct sock *sk)
404 {
405 	struct dst_entry *dst = __sk_dst_check(sk, 0);
406 
407 	if (dst)
408 		dst->ops->redirect(dst, sk, skb);
409 }
410 
411 
412 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
413 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
414 {
415 	struct request_sock *req = inet_reqsk(sk);
416 	struct net *net = sock_net(sk);
417 
418 	/* ICMPs are not backlogged, hence we cannot get
419 	 * an established socket here.
420 	 */
421 	if (seq != tcp_rsk(req)->snt_isn) {
422 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
423 	} else if (abort) {
424 		/*
425 		 * Still in SYN_RECV, just remove it silently.
426 		 * There is no good way to pass the error to the newly
427 		 * created socket, and POSIX does not want network
428 		 * errors returned from accept().
429 		 */
430 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
431 		tcp_listendrop(req->rsk_listener);
432 	}
433 	reqsk_put(req);
434 }
435 EXPORT_SYMBOL(tcp_req_err);
436 
437 /* TCP-LD (RFC 6069) logic */
438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439 {
440 	struct inet_connection_sock *icsk = inet_csk(sk);
441 	struct tcp_sock *tp = tcp_sk(sk);
442 	struct sk_buff *skb;
443 	s32 remaining;
444 	u32 delta_us;
445 
446 	if (sock_owned_by_user(sk))
447 		return;
448 
449 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
450 	    !icsk->icsk_backoff)
451 		return;
452 
453 	skb = tcp_rtx_queue_head(sk);
454 	if (WARN_ON_ONCE(!skb))
455 		return;
456 
457 	icsk->icsk_backoff--;
458 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
460 
461 	tcp_mstamp_refresh(tp);
462 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464 
465 	if (remaining > 0) {
466 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
467 					  remaining, TCP_RTO_MAX);
468 	} else {
469 		/* RTO revert clocked out retransmission.
470 		 * Will retransmit now.
471 		 */
472 		tcp_retransmit_timer(sk);
473 	}
474 }
475 EXPORT_SYMBOL(tcp_ld_RTO_revert);
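
/* In short: when an ICMP unreachable arrives for the segment at the head of
 * the retransmit queue while we are in exponential backoff, undo one backoff
 * step and either re-arm the retransmit timer with the remaining time or
 * retransmit immediately.  This mirrors the TCP-LD behaviour described in
 * RFC 6069 for long connectivity disruptions.
 */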
476 
477 /*
478  * This routine is called by the ICMP module when it gets some
479  * sort of error condition.  If err < 0 then the socket should
480  * be closed and the error returned to the user.  If err > 0
481  * it's just the icmp type << 8 | icmp code.  After adjustment
482  * header points to the first 8 bytes of the tcp header.  We need
483  * to find the appropriate port.
484  *
485  * The locking strategy used here is very "optimistic". When
486  * someone else accesses the socket the ICMP is just dropped
487  * and for some paths there is no check at all.
488  * A more general error queue to queue errors for later handling
489  * is probably better.
490  *
491  */
492 
493 int tcp_v4_err(struct sk_buff *skb, u32 info)
494 {
495 	const struct iphdr *iph = (const struct iphdr *)skb->data;
496 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
497 	struct tcp_sock *tp;
498 	const int type = icmp_hdr(skb)->type;
499 	const int code = icmp_hdr(skb)->code;
500 	struct sock *sk;
501 	struct request_sock *fastopen;
502 	u32 seq, snd_una;
503 	int err;
504 	struct net *net = dev_net(skb->dev);
505 
506 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
507 				       iph->daddr, th->dest, iph->saddr,
508 				       ntohs(th->source), inet_iif(skb), 0);
509 	if (!sk) {
510 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
511 		return -ENOENT;
512 	}
513 	if (sk->sk_state == TCP_TIME_WAIT) {
514 		/* To increase the counter of ignored icmps for TCP-AO */
515 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
516 		inet_twsk_put(inet_twsk(sk));
517 		return 0;
518 	}
519 	seq = ntohl(th->seq);
520 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
521 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
522 				     type == ICMP_TIME_EXCEEDED ||
523 				     (type == ICMP_DEST_UNREACH &&
524 				      (code == ICMP_NET_UNREACH ||
525 				       code == ICMP_HOST_UNREACH)));
526 		return 0;
527 	}
528 
529 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
530 		sock_put(sk);
531 		return 0;
532 	}
533 
534 	bh_lock_sock(sk);
535 	/* If too many ICMPs get dropped on busy
536 	 * servers this needs to be solved differently.
537 	 * We do take care of the PMTU discovery (RFC1191) special case:
538 	 * we can receive locally generated ICMP messages while the socket is held.
539 	 */
540 	if (sock_owned_by_user(sk)) {
541 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
542 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
543 	}
544 	if (sk->sk_state == TCP_CLOSE)
545 		goto out;
546 
547 	if (static_branch_unlikely(&ip4_min_ttl)) {
548 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
549 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
550 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
551 			goto out;
552 		}
553 	}
554 
555 	tp = tcp_sk(sk);
556 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
557 	fastopen = rcu_dereference(tp->fastopen_rsk);
558 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
559 	if (sk->sk_state != TCP_LISTEN &&
560 	    !between(seq, snd_una, tp->snd_nxt)) {
561 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
562 		goto out;
563 	}
564 
565 	switch (type) {
566 	case ICMP_REDIRECT:
567 		if (!sock_owned_by_user(sk))
568 			do_redirect(skb, sk);
569 		goto out;
570 	case ICMP_SOURCE_QUENCH:
571 		/* Just silently ignore these. */
572 		goto out;
573 	case ICMP_PARAMETERPROB:
574 		err = EPROTO;
575 		break;
576 	case ICMP_DEST_UNREACH:
577 		if (code > NR_ICMP_UNREACH)
578 			goto out;
579 
580 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
581 			/* We are not interested in TCP_LISTEN and open_requests
582 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
583 			 * they should go through unfragmented).
584 			 */
585 			if (sk->sk_state == TCP_LISTEN)
586 				goto out;
587 
588 			WRITE_ONCE(tp->mtu_info, info);
589 			if (!sock_owned_by_user(sk)) {
590 				tcp_v4_mtu_reduced(sk);
591 			} else {
592 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
593 					sock_hold(sk);
594 			}
595 			goto out;
596 		}
597 
598 		err = icmp_err_convert[code].errno;
599 		/* check if this ICMP message allows revert of backoff.
600 		 * (see RFC 6069)
601 		 */
602 		if (!fastopen &&
603 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
604 			tcp_ld_RTO_revert(sk, seq);
605 		break;
606 	case ICMP_TIME_EXCEEDED:
607 		err = EHOSTUNREACH;
608 		break;
609 	default:
610 		goto out;
611 	}
612 
613 	switch (sk->sk_state) {
614 	case TCP_SYN_SENT:
615 	case TCP_SYN_RECV:
616 		/* Only in fast or simultaneous open. If a fast open socket is
617 		 * already accepted it is treated as a connected one below.
618 		 */
619 		if (fastopen && !fastopen->sk)
620 			break;
621 
622 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
623 
624 		if (!sock_owned_by_user(sk))
625 			tcp_done_with_error(sk, err);
626 		else
627 			WRITE_ONCE(sk->sk_err_soft, err);
628 		goto out;
629 	}
630 
631 	/* If we've already connected we will keep trying
632 	 * until we time out, or the user gives up.
633 	 *
634 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
635 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
636 	 * but it is obsoleted by PMTU discovery).
637 	 *
638 	 * Note that in the modern internet, where routing is unreliable
639 	 * and broken firewalls sit in every dark corner sending random
640 	 * errors on their masters' orders, even these two messages have
641 	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
642 	 *
643 	 * Now we are in compliance with RFCs.
644 	 *							--ANK (980905)
645 	 */
646 
647 	if (!sock_owned_by_user(sk) &&
648 	    inet_test_bit(RECVERR, sk)) {
649 		WRITE_ONCE(sk->sk_err, err);
650 		sk_error_report(sk);
651 	} else	{ /* Only an error on timeout */
652 		WRITE_ONCE(sk->sk_err_soft, err);
653 	}
654 
655 out:
656 	bh_unlock_sock(sk);
657 	sock_put(sk);
658 	return 0;
659 }
660 
661 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
662 {
663 	struct tcphdr *th = tcp_hdr(skb);
664 
665 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
666 	skb->csum_start = skb_transport_header(skb) - skb->head;
667 	skb->csum_offset = offsetof(struct tcphdr, check);
668 }
669 
670 /* This routine computes an IPv4 TCP checksum. */
671 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
672 {
673 	const struct inet_sock *inet = inet_sk(sk);
674 
675 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
676 }
677 EXPORT_SYMBOL(tcp_v4_send_check);
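
/* Note that only the pseudo-header sum is stored in th->check (tcp_v4_check()
 * is called with a zero base), while csum_start/csum_offset tell the NIC, or
 * skb_checksum_help() on the software fallback path, where to fold in the
 * rest of the checksum.
 */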
678 
679 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
680 
681 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
682 				 const struct tcp_ao_hdr *aoh,
683 				 struct ip_reply_arg *arg, struct tcphdr *reply,
684 				 __be32 reply_options[REPLY_OPTIONS_LEN])
685 {
686 #ifdef CONFIG_TCP_AO
687 	int sdif = tcp_v4_sdif(skb);
688 	int dif = inet_iif(skb);
689 	int l3index = sdif ? dif : 0;
690 	bool allocated_traffic_key;
691 	struct tcp_ao_key *key;
692 	char *traffic_key;
693 	bool drop = true;
694 	u32 ao_sne = 0;
695 	u8 keyid;
696 
697 	rcu_read_lock();
698 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
699 				 &key, &traffic_key, &allocated_traffic_key,
700 				 &keyid, &ao_sne))
701 		goto out;
702 
703 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
704 				 (aoh->rnext_keyid << 8) | keyid);
705 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
706 	reply->doff = arg->iov[0].iov_len / 4;
707 
708 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
709 			    key, traffic_key,
710 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
711 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
712 			    reply, ao_sne))
713 		goto out;
714 	drop = false;
715 out:
716 	rcu_read_unlock();
717 	if (allocated_traffic_key)
718 		kfree(traffic_key);
719 	return drop;
720 #else
721 	return true;
722 #endif
723 }
724 
725 /*
726  *	This routine will send an RST to the other TCP.
727  *
728  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
729  *		      for the reset?
730  *	Answer: if a packet caused the RST, it is not for a socket
731  *		existing in our system; if it is matched to a socket,
732  *		it is just a duplicate segment or a bug in the other side's TCP.
733  *		So we build the reply based only on the parameters that
734  *		arrived with the segment.
735  *	Exception: precedence violation. We do not implement it in any case.
736  */
737 
738 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
739 			      enum sk_rst_reason reason)
740 {
741 	const struct tcphdr *th = tcp_hdr(skb);
742 	struct {
743 		struct tcphdr th;
744 		__be32 opt[REPLY_OPTIONS_LEN];
745 	} rep;
746 	const __u8 *md5_hash_location = NULL;
747 	const struct tcp_ao_hdr *aoh;
748 	struct ip_reply_arg arg;
749 #ifdef CONFIG_TCP_MD5SIG
750 	struct tcp_md5sig_key *key = NULL;
751 	unsigned char newhash[16];
752 	struct sock *sk1 = NULL;
753 	int genhash;
754 #endif
755 	u64 transmit_time = 0;
756 	struct sock *ctl_sk;
757 	struct net *net;
758 	u32 txhash = 0;
759 
760 	/* Never send a reset in response to a reset. */
761 	if (th->rst)
762 		return;
763 
764 	/* If sk is not NULL, it means we did a successful lookup and the
765 	 * incoming route had to be correct. prequeue might have dropped our dst.
766 	 */
767 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
768 		return;
769 
770 	/* Swap the send and the receive. */
771 	memset(&rep, 0, sizeof(rep));
772 	rep.th.dest   = th->source;
773 	rep.th.source = th->dest;
774 	rep.th.doff   = sizeof(struct tcphdr) / 4;
775 	rep.th.rst    = 1;
776 
777 	if (th->ack) {
778 		rep.th.seq = th->ack_seq;
779 	} else {
780 		rep.th.ack = 1;
781 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
782 				       skb->len - (th->doff << 2));
783 	}
784 
785 	memset(&arg, 0, sizeof(arg));
786 	arg.iov[0].iov_base = (unsigned char *)&rep;
787 	arg.iov[0].iov_len  = sizeof(rep.th);
788 
789 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
790 
791 	/* Invalid TCP option size or twice included auth */
792 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
793 		return;
794 
795 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
796 		return;
797 
798 #ifdef CONFIG_TCP_MD5SIG
799 	rcu_read_lock();
800 	if (sk && sk_fullsock(sk)) {
801 		const union tcp_md5_addr *addr;
802 		int l3index;
803 
804 		/* sdif set, means packet ingressed via a device
805 		 * in an L3 domain and inet_iif is set to it.
806 		 */
807 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
808 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
809 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
810 	} else if (md5_hash_location) {
811 		const union tcp_md5_addr *addr;
812 		int sdif = tcp_v4_sdif(skb);
813 		int dif = inet_iif(skb);
814 		int l3index;
815 
816 		/*
817 		 * The active side is lost. Try to find the listening socket
818 		 * through the source port, and then find the md5 key through
819 		 * the listening socket. We do not loosen security here:
820 		 * the incoming packet is checked with the md5 hash of the found
821 		 * key, and no RST is generated if the md5 hash doesn't match.
822 		 */
823 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
824 					     NULL, 0, ip_hdr(skb)->saddr,
825 					     th->source, ip_hdr(skb)->daddr,
826 					     ntohs(th->source), dif, sdif);
827 		/* don't send rst if it can't find key */
828 		if (!sk1)
829 			goto out;
830 
831 		/* sdif set, means packet ingressed via a device
832 		 * in an L3 domain and dif is set to it.
833 		 */
834 		l3index = sdif ? dif : 0;
835 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
836 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
837 		if (!key)
838 			goto out;
839 
840 
841 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
842 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
843 			goto out;
844 
845 	}
846 
847 	if (key) {
848 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
849 				   (TCPOPT_NOP << 16) |
850 				   (TCPOPT_MD5SIG << 8) |
851 				   TCPOLEN_MD5SIG);
852 		/* Update length and the length the header thinks exists */
853 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
854 		rep.th.doff = arg.iov[0].iov_len / 4;
855 
856 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
857 				     key, ip_hdr(skb)->saddr,
858 				     ip_hdr(skb)->daddr, &rep.th);
859 	}
860 #endif
861 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
862 	if (rep.opt[0] == 0) {
863 		__be32 mrst = mptcp_reset_option(skb);
864 
865 		if (mrst) {
866 			rep.opt[0] = mrst;
867 			arg.iov[0].iov_len += sizeof(mrst);
868 			rep.th.doff = arg.iov[0].iov_len / 4;
869 		}
870 	}
871 
872 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
873 				      ip_hdr(skb)->saddr, /* XXX */
874 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
875 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
876 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
877 
878 	/* When the socket is gone, all binding information is lost.
879 	 * Routing might fail in this case. No choice here: if we choose to force
880 	 * the input interface, we will misroute in case of an asymmetric route.
881 	 */
882 	if (sk)
883 		arg.bound_dev_if = sk->sk_bound_dev_if;
884 
885 	trace_tcp_send_reset(sk, skb, reason);
886 
887 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
888 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
889 
890 	arg.tos = ip_hdr(skb)->tos;
891 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
894 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
895 
896 	sock_net_set(ctl_sk, net);
897 	if (sk) {
898 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
899 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
900 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
901 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
902 		transmit_time = tcp_transmit_time(sk);
903 		xfrm_sk_clone_policy(ctl_sk, sk);
904 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
905 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
906 	} else {
907 		ctl_sk->sk_mark = 0;
908 		ctl_sk->sk_priority = 0;
909 	}
910 	ip_send_unicast_reply(ctl_sk, sk,
911 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
912 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913 			      &arg, arg.iov[0].iov_len,
914 			      transmit_time, txhash);
915 
916 	xfrm_sk_free_policy(ctl_sk);
917 	sock_net_set(ctl_sk, &init_net);
918 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
919 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
920 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
921 	local_bh_enable();
922 
923 #ifdef CONFIG_TCP_MD5SIG
924 out:
925 	rcu_read_unlock();
926 #endif
927 }
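
/* The reset is sent from a per-CPU control socket (ipv4_tcp_sk above) under
 * local_bh_disable() and a per-CPU bh lock, so no full socket context is
 * required; mark, priority and txhash are borrowed from the matched socket
 * when one exists.
 */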
928 
929 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
930    outside of socket context, is certainly ugly. What can I do?
931  */
932 
933 static void tcp_v4_send_ack(const struct sock *sk,
934 			    struct sk_buff *skb, u32 seq, u32 ack,
935 			    u32 win, u32 tsval, u32 tsecr, int oif,
936 			    struct tcp_key *key,
937 			    int reply_flags, u8 tos, u32 txhash)
938 {
939 	const struct tcphdr *th = tcp_hdr(skb);
940 	struct {
941 		struct tcphdr th;
942 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
943 	} rep;
944 	struct net *net = sock_net(sk);
945 	struct ip_reply_arg arg;
946 	struct sock *ctl_sk;
947 	u64 transmit_time;
948 
949 	memset(&rep.th, 0, sizeof(struct tcphdr));
950 	memset(&arg, 0, sizeof(arg));
951 
952 	arg.iov[0].iov_base = (unsigned char *)&rep;
953 	arg.iov[0].iov_len  = sizeof(rep.th);
954 	if (tsecr) {
955 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
956 				   (TCPOPT_TIMESTAMP << 8) |
957 				   TCPOLEN_TIMESTAMP);
958 		rep.opt[1] = htonl(tsval);
959 		rep.opt[2] = htonl(tsecr);
960 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
961 	}
962 
963 	/* Swap the send and the receive. */
964 	rep.th.dest    = th->source;
965 	rep.th.source  = th->dest;
966 	rep.th.doff    = arg.iov[0].iov_len / 4;
967 	rep.th.seq     = htonl(seq);
968 	rep.th.ack_seq = htonl(ack);
969 	rep.th.ack     = 1;
970 	rep.th.window  = htons(win);
971 
972 #ifdef CONFIG_TCP_MD5SIG
973 	if (tcp_key_is_md5(key)) {
974 		int offset = (tsecr) ? 3 : 0;
975 
976 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
977 					  (TCPOPT_NOP << 16) |
978 					  (TCPOPT_MD5SIG << 8) |
979 					  TCPOLEN_MD5SIG);
980 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
981 		rep.th.doff = arg.iov[0].iov_len/4;
982 
983 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
984 				    key->md5_key, ip_hdr(skb)->saddr,
985 				    ip_hdr(skb)->daddr, &rep.th);
986 	}
987 #endif
988 #ifdef CONFIG_TCP_AO
989 	if (tcp_key_is_ao(key)) {
990 		int offset = (tsecr) ? 3 : 0;
991 
992 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
993 					  (tcp_ao_len(key->ao_key) << 16) |
994 					  (key->ao_key->sndid << 8) |
995 					  key->rcv_next);
996 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
997 		rep.th.doff = arg.iov[0].iov_len / 4;
998 
999 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1000 				key->ao_key, key->traffic_key,
1001 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1002 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1003 				&rep.th, key->sne);
1004 	}
1005 #endif
1006 	arg.flags = reply_flags;
1007 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1008 				      ip_hdr(skb)->saddr, /* XXX */
1009 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1010 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1011 	if (oif)
1012 		arg.bound_dev_if = oif;
1013 	arg.tos = tos;
1014 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1015 	local_bh_disable();
1016 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1017 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1018 	sock_net_set(ctl_sk, net);
1019 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1020 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1021 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1022 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1023 	transmit_time = tcp_transmit_time(sk);
1024 	ip_send_unicast_reply(ctl_sk, sk,
1025 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1026 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1027 			      &arg, arg.iov[0].iov_len,
1028 			      transmit_time, txhash);
1029 
1030 	sock_net_set(ctl_sk, &init_net);
1031 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1032 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1033 	local_bh_enable();
1034 }
1035 
1036 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1037 {
1038 	struct inet_timewait_sock *tw = inet_twsk(sk);
1039 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1040 	struct tcp_key key = {};
1041 #ifdef CONFIG_TCP_AO
1042 	struct tcp_ao_info *ao_info;
1043 
1044 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1045 		/* FIXME: the segment to-be-acked is not verified yet */
1046 		ao_info = rcu_dereference(tcptw->ao_info);
1047 		if (ao_info) {
1048 			const struct tcp_ao_hdr *aoh;
1049 
1050 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1051 				inet_twsk_put(tw);
1052 				return;
1053 			}
1054 
1055 			if (aoh)
1056 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1057 								    aoh->rnext_keyid, -1);
1058 		}
1059 	}
1060 	if (key.ao_key) {
1061 		struct tcp_ao_key *rnext_key;
1062 
1063 		key.traffic_key = snd_other_key(key.ao_key);
1064 		key.sne = READ_ONCE(ao_info->snd_sne);
1065 		rnext_key = READ_ONCE(ao_info->rnext_key);
1066 		key.rcv_next = rnext_key->rcvid;
1067 		key.type = TCP_KEY_AO;
1068 #else
1069 	if (0) {
1070 #endif
1071 	} else if (static_branch_tcp_md5()) {
1072 		key.md5_key = tcp_twsk_md5_key(tcptw);
1073 		if (key.md5_key)
1074 			key.type = TCP_KEY_MD5;
1075 	}
1076 
1077 	tcp_v4_send_ack(sk, skb,
1078 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1079 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1080 			tcp_tw_tsval(tcptw),
1081 			READ_ONCE(tcptw->tw_ts_recent),
1082 			tw->tw_bound_dev_if, &key,
1083 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1084 			tw->tw_tos,
1085 			tw->tw_txhash);
1086 
1087 	inet_twsk_put(tw);
1088 }
1089 
1090 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1091 				  struct request_sock *req)
1092 {
1093 	struct tcp_key key = {};
1094 
1095 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1096 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1097 	 */
1098 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1099 					     tcp_sk(sk)->snd_nxt;
1100 
1101 #ifdef CONFIG_TCP_AO
1102 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1103 	    tcp_rsk_used_ao(req)) {
1104 		const union tcp_md5_addr *addr;
1105 		const struct tcp_ao_hdr *aoh;
1106 		int l3index;
1107 
1108 		/* Invalid TCP option size or twice included auth */
1109 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1110 			return;
1111 		if (!aoh)
1112 			return;
1113 
1114 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1115 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1116 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1117 					      aoh->rnext_keyid, -1);
1118 		if (unlikely(!key.ao_key)) {
1119 			/* Send ACK with any matching MKT for the peer */
1120 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1121 			/* The matching key disappeared (user removed the key?);
1122 			 * let the handshake time out.
1123 			 */
1124 			if (!key.ao_key) {
1125 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1126 						     addr,
1127 						     ntohs(tcp_hdr(skb)->source),
1128 						     &ip_hdr(skb)->daddr,
1129 						     ntohs(tcp_hdr(skb)->dest));
1130 				return;
1131 			}
1132 		}
1133 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1134 		if (!key.traffic_key)
1135 			return;
1136 
1137 		key.type = TCP_KEY_AO;
1138 		key.rcv_next = aoh->keyid;
1139 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1140 #else
1141 	if (0) {
1142 #endif
1143 	} else if (static_branch_tcp_md5()) {
1144 		const union tcp_md5_addr *addr;
1145 		int l3index;
1146 
1147 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1148 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1149 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150 		if (key.md5_key)
1151 			key.type = TCP_KEY_MD5;
1152 	}
1153 
1154 	tcp_v4_send_ack(sk, skb, seq,
1155 			tcp_rsk(req)->rcv_nxt,
1156 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1157 			tcp_rsk_tsval(tcp_rsk(req)),
1158 			READ_ONCE(req->ts_recent),
1159 			0, &key,
1160 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1161 			ip_hdr(skb)->tos,
1162 			READ_ONCE(tcp_rsk(req)->txhash));
1163 	if (tcp_key_is_ao(&key))
1164 		kfree(key.traffic_key);
1165 }
1166 
1167 /*
1168  *	Send a SYN-ACK after having received a SYN.
1169  *	This still operates on a request_sock only, not on a big
1170  *	socket.
1171  */
1172 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1173 			      struct flowi *fl,
1174 			      struct request_sock *req,
1175 			      struct tcp_fastopen_cookie *foc,
1176 			      enum tcp_synack_type synack_type,
1177 			      struct sk_buff *syn_skb)
1178 {
1179 	const struct inet_request_sock *ireq = inet_rsk(req);
1180 	struct flowi4 fl4;
1181 	int err = -1;
1182 	struct sk_buff *skb;
1183 	u8 tos;
1184 
1185 	/* First, grab a route. */
1186 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1187 		return -1;
1188 
1189 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1190 
1191 	if (skb) {
1192 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1193 
1194 		tos = READ_ONCE(inet_sk(sk)->tos);
1195 
1196 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1197 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1198 			      (tos & INET_ECN_MASK);
1199 
1200 		if (!INET_ECN_is_capable(tos) &&
1201 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1202 			tos |= INET_ECN_ECT_0;
1203 
1204 		rcu_read_lock();
1205 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1206 					    ireq->ir_rmt_addr,
1207 					    rcu_dereference(ireq->ireq_opt),
1208 					    tos);
1209 		rcu_read_unlock();
1210 		err = net_xmit_eval(err);
1211 	}
1212 
1213 	return err;
1214 }
1215 
1216 /*
1217  *	IPv4 request_sock destructor.
1218  */
1219 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1220 {
1221 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1222 }
1223 
1224 #ifdef CONFIG_TCP_MD5SIG
1225 /*
1226  * RFC2385 MD5 checksumming requires a mapping of
1227  * IP address->MD5 Key.
1228  * We need to maintain these in the sk structure.
1229  */
1230 
1231 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1232 EXPORT_SYMBOL(tcp_md5_needed);
1233 
1234 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1235 {
1236 	if (!old)
1237 		return true;
1238 
1239 	/* l3index always overrides non-l3index */
1240 	if (old->l3index && new->l3index == 0)
1241 		return false;
1242 	if (old->l3index == 0 && new->l3index)
1243 		return true;
1244 
1245 	return old->prefixlen < new->prefixlen;
1246 }
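
/* Example of the resulting preference order: for a peer that matches both a
 * key bound to an L3 master device and an unbound key, the bound key wins;
 * between two unbound (or two bound) keys, the one with the longer prefix,
 * e.g. a /32 entry over a /24 entry, is chosen.
 */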
1247 
1248 /* Find the Key structure for an address.  */
1249 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1250 					   const union tcp_md5_addr *addr,
1251 					   int family, bool any_l3index)
1252 {
1253 	const struct tcp_sock *tp = tcp_sk(sk);
1254 	struct tcp_md5sig_key *key;
1255 	const struct tcp_md5sig_info *md5sig;
1256 	__be32 mask;
1257 	struct tcp_md5sig_key *best_match = NULL;
1258 	bool match;
1259 
1260 	/* caller either holds rcu_read_lock() or socket lock */
1261 	md5sig = rcu_dereference_check(tp->md5sig_info,
1262 				       lockdep_sock_is_held(sk));
1263 	if (!md5sig)
1264 		return NULL;
1265 
1266 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1267 				 lockdep_sock_is_held(sk)) {
1268 		if (key->family != family)
1269 			continue;
1270 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1271 		    key->l3index != l3index)
1272 			continue;
1273 		if (family == AF_INET) {
1274 			mask = inet_make_mask(key->prefixlen);
1275 			match = (key->addr.a4.s_addr & mask) ==
1276 				(addr->a4.s_addr & mask);
1277 #if IS_ENABLED(CONFIG_IPV6)
1278 		} else if (family == AF_INET6) {
1279 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1280 						  key->prefixlen);
1281 #endif
1282 		} else {
1283 			match = false;
1284 		}
1285 
1286 		if (match && better_md5_match(best_match, key))
1287 			best_match = key;
1288 	}
1289 	return best_match;
1290 }
1291 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1292 
1293 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1294 						      const union tcp_md5_addr *addr,
1295 						      int family, u8 prefixlen,
1296 						      int l3index, u8 flags)
1297 {
1298 	const struct tcp_sock *tp = tcp_sk(sk);
1299 	struct tcp_md5sig_key *key;
1300 	unsigned int size = sizeof(struct in_addr);
1301 	const struct tcp_md5sig_info *md5sig;
1302 
1303 	/* caller either holds rcu_read_lock() or socket lock */
1304 	md5sig = rcu_dereference_check(tp->md5sig_info,
1305 				       lockdep_sock_is_held(sk));
1306 	if (!md5sig)
1307 		return NULL;
1308 #if IS_ENABLED(CONFIG_IPV6)
1309 	if (family == AF_INET6)
1310 		size = sizeof(struct in6_addr);
1311 #endif
1312 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1313 				 lockdep_sock_is_held(sk)) {
1314 		if (key->family != family)
1315 			continue;
1316 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1317 			continue;
1318 		if (key->l3index != l3index)
1319 			continue;
1320 		if (!memcmp(&key->addr, addr, size) &&
1321 		    key->prefixlen == prefixlen)
1322 			return key;
1323 	}
1324 	return NULL;
1325 }
1326 
1327 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1328 					 const struct sock *addr_sk)
1329 {
1330 	const union tcp_md5_addr *addr;
1331 	int l3index;
1332 
1333 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1334 						 addr_sk->sk_bound_dev_if);
1335 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1336 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1337 }
1338 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1339 
1340 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1341 {
1342 	struct tcp_sock *tp = tcp_sk(sk);
1343 	struct tcp_md5sig_info *md5sig;
1344 
1345 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1346 	if (!md5sig)
1347 		return -ENOMEM;
1348 
1349 	sk_gso_disable(sk);
1350 	INIT_HLIST_HEAD(&md5sig->head);
1351 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1352 	return 0;
1353 }
1354 
1355 /* This can be called on a newly created socket, from other files */
1356 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1357 			    int family, u8 prefixlen, int l3index, u8 flags,
1358 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1359 {
1360 	/* Add Key to the list */
1361 	struct tcp_md5sig_key *key;
1362 	struct tcp_sock *tp = tcp_sk(sk);
1363 	struct tcp_md5sig_info *md5sig;
1364 
1365 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1366 	if (key) {
1367 		/* Pre-existing entry - just update that one.
1368 		 * Note that the key might be used concurrently.
1369 		 * data_race() is telling KCSAN that we do not care about
1370 		 * key mismatches, since changing the MD5 key on live flows
1371 		 * can lead to packet drops.
1372 		 */
1373 		data_race(memcpy(key->key, newkey, newkeylen));
1374 
1375 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1376 		 * Also note that a reader could catch the new key->keylen value
1377 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1378 		 * at sock_kmalloc() time below these lines.
1379 		 */
1380 		WRITE_ONCE(key->keylen, newkeylen);
1381 
1382 		return 0;
1383 	}
1384 
1385 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1386 					   lockdep_sock_is_held(sk));
1387 
1388 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1389 	if (!key)
1390 		return -ENOMEM;
1391 
1392 	memcpy(key->key, newkey, newkeylen);
1393 	key->keylen = newkeylen;
1394 	key->family = family;
1395 	key->prefixlen = prefixlen;
1396 	key->l3index = l3index;
1397 	key->flags = flags;
1398 	memcpy(&key->addr, addr,
1399 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1400 								 sizeof(struct in_addr));
1401 	hlist_add_head_rcu(&key->node, &md5sig->head);
1402 	return 0;
1403 }
1404 
1405 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1406 		   int family, u8 prefixlen, int l3index, u8 flags,
1407 		   const u8 *newkey, u8 newkeylen)
1408 {
1409 	struct tcp_sock *tp = tcp_sk(sk);
1410 
1411 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1412 		if (tcp_md5_alloc_sigpool())
1413 			return -ENOMEM;
1414 
1415 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1416 			tcp_md5_release_sigpool();
1417 			return -ENOMEM;
1418 		}
1419 
1420 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1421 			struct tcp_md5sig_info *md5sig;
1422 
1423 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1424 			rcu_assign_pointer(tp->md5sig_info, NULL);
1425 			kfree_rcu(md5sig, rcu);
1426 			tcp_md5_release_sigpool();
1427 			return -EUSERS;
1428 		}
1429 	}
1430 
1431 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1432 				newkey, newkeylen, GFP_KERNEL);
1433 }
1434 EXPORT_SYMBOL(tcp_md5_do_add);
1435 
1436 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1437 		     int family, u8 prefixlen, int l3index,
1438 		     struct tcp_md5sig_key *key)
1439 {
1440 	struct tcp_sock *tp = tcp_sk(sk);
1441 
1442 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1443 		tcp_md5_add_sigpool();
1444 
1445 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1446 			tcp_md5_release_sigpool();
1447 			return -ENOMEM;
1448 		}
1449 
1450 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1451 			struct tcp_md5sig_info *md5sig;
1452 
1453 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1454 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1455 			rcu_assign_pointer(tp->md5sig_info, NULL);
1456 			kfree_rcu(md5sig, rcu);
1457 			tcp_md5_release_sigpool();
1458 			return -EUSERS;
1459 		}
1460 	}
1461 
1462 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1463 				key->flags, key->key, key->keylen,
1464 				sk_gfp_mask(sk, GFP_ATOMIC));
1465 }
1466 EXPORT_SYMBOL(tcp_md5_key_copy);
1467 
1468 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1469 		   u8 prefixlen, int l3index, u8 flags)
1470 {
1471 	struct tcp_md5sig_key *key;
1472 
1473 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1474 	if (!key)
1475 		return -ENOENT;
1476 	hlist_del_rcu(&key->node);
1477 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1478 	kfree_rcu(key, rcu);
1479 	return 0;
1480 }
1481 EXPORT_SYMBOL(tcp_md5_do_del);
1482 
1483 void tcp_clear_md5_list(struct sock *sk)
1484 {
1485 	struct tcp_sock *tp = tcp_sk(sk);
1486 	struct tcp_md5sig_key *key;
1487 	struct hlist_node *n;
1488 	struct tcp_md5sig_info *md5sig;
1489 
1490 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1491 
1492 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1493 		hlist_del_rcu(&key->node);
1494 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1495 		kfree_rcu(key, rcu);
1496 	}
1497 }
1498 
1499 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1500 				 sockptr_t optval, int optlen)
1501 {
1502 	struct tcp_md5sig cmd;
1503 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1504 	const union tcp_md5_addr *addr;
1505 	u8 prefixlen = 32;
1506 	int l3index = 0;
1507 	bool l3flag;
1508 	u8 flags;
1509 
1510 	if (optlen < sizeof(cmd))
1511 		return -EINVAL;
1512 
1513 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1514 		return -EFAULT;
1515 
1516 	if (sin->sin_family != AF_INET)
1517 		return -EINVAL;
1518 
1519 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1521 
1522 	if (optname == TCP_MD5SIG_EXT &&
1523 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1524 		prefixlen = cmd.tcpm_prefixlen;
1525 		if (prefixlen > 32)
1526 			return -EINVAL;
1527 	}
1528 
1529 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1530 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1531 		struct net_device *dev;
1532 
1533 		rcu_read_lock();
1534 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1535 		if (dev && netif_is_l3_master(dev))
1536 			l3index = dev->ifindex;
1537 
1538 		rcu_read_unlock();
1539 
1540 		/* ok to reference set/not set outside of rcu;
1541 		 * right now device MUST be an L3 master
1542 		 */
1543 		if (!dev || !l3index)
1544 			return -EINVAL;
1545 	}
1546 
1547 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1548 
1549 	if (!cmd.tcpm_keylen)
1550 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1551 
1552 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1553 		return -EINVAL;
1554 
1555 	/* Don't allow keys for peers that have a matching TCP-AO key.
1556 	 * See the comment in tcp_ao_add_cmd()
1557 	 */
1558 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1559 		return -EKEYREJECTED;
1560 
1561 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1562 			      cmd.tcpm_key, cmd.tcpm_keylen);
1563 }
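
/* Userspace configures these keys with setsockopt(IPPROTO_TCP, TCP_MD5SIG)
 * or TCP_MD5SIG_EXT.  A rough sketch (see include/uapi/linux/tcp.h for the
 * authoritative struct tcp_md5sig layout):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, secret, 16);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key, as handled above.
 */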
1564 
1565 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1566 				   __be32 daddr, __be32 saddr,
1567 				   const struct tcphdr *th, int nbytes)
1568 {
1569 	struct tcp4_pseudohdr *bp;
1570 	struct scatterlist sg;
1571 	struct tcphdr *_th;
1572 
1573 	bp = hp->scratch;
1574 	bp->saddr = saddr;
1575 	bp->daddr = daddr;
1576 	bp->pad = 0;
1577 	bp->protocol = IPPROTO_TCP;
1578 	bp->len = cpu_to_be16(nbytes);
1579 
1580 	_th = (struct tcphdr *)(bp + 1);
1581 	memcpy(_th, th, sizeof(*th));
1582 	_th->check = 0;
1583 
1584 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1585 	ahash_request_set_crypt(hp->req, &sg, NULL,
1586 				sizeof(*bp) + sizeof(*th));
1587 	return crypto_ahash_update(hp->req);
1588 }
1589 
1590 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1591 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1592 {
1593 	struct tcp_sigpool hp;
1594 
1595 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1596 		goto clear_hash_nostart;
1597 
1598 	if (crypto_ahash_init(hp.req))
1599 		goto clear_hash;
1600 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1601 		goto clear_hash;
1602 	if (tcp_md5_hash_key(&hp, key))
1603 		goto clear_hash;
1604 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1605 	if (crypto_ahash_final(hp.req))
1606 		goto clear_hash;
1607 
1608 	tcp_sigpool_end(&hp);
1609 	return 0;
1610 
1611 clear_hash:
1612 	tcp_sigpool_end(&hp);
1613 clear_hash_nostart:
1614 	memset(md5_hash, 0, 16);
1615 	return 1;
1616 }
1617 
1618 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1619 			const struct sock *sk,
1620 			const struct sk_buff *skb)
1621 {
1622 	const struct tcphdr *th = tcp_hdr(skb);
1623 	struct tcp_sigpool hp;
1624 	__be32 saddr, daddr;
1625 
1626 	if (sk) { /* valid for established/request sockets */
1627 		saddr = sk->sk_rcv_saddr;
1628 		daddr = sk->sk_daddr;
1629 	} else {
1630 		const struct iphdr *iph = ip_hdr(skb);
1631 		saddr = iph->saddr;
1632 		daddr = iph->daddr;
1633 	}
1634 
1635 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1636 		goto clear_hash_nostart;
1637 
1638 	if (crypto_ahash_init(hp.req))
1639 		goto clear_hash;
1640 
1641 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1642 		goto clear_hash;
1643 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1644 		goto clear_hash;
1645 	if (tcp_md5_hash_key(&hp, key))
1646 		goto clear_hash;
1647 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1648 	if (crypto_ahash_final(hp.req))
1649 		goto clear_hash;
1650 
1651 	tcp_sigpool_end(&hp);
1652 	return 0;
1653 
1654 clear_hash:
1655 	tcp_sigpool_end(&hp);
1656 clear_hash_nostart:
1657 	memset(md5_hash, 0, 16);
1658 	return 1;
1659 }
1660 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1661 
1662 #endif
1663 
1664 static void tcp_v4_init_req(struct request_sock *req,
1665 			    const struct sock *sk_listener,
1666 			    struct sk_buff *skb)
1667 {
1668 	struct inet_request_sock *ireq = inet_rsk(req);
1669 	struct net *net = sock_net(sk_listener);
1670 
1671 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1672 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1673 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1674 }
1675 
1676 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1677 					  struct sk_buff *skb,
1678 					  struct flowi *fl,
1679 					  struct request_sock *req,
1680 					  u32 tw_isn)
1681 {
1682 	tcp_v4_init_req(req, sk, skb);
1683 
1684 	if (security_inet_conn_request(sk, skb, req))
1685 		return NULL;
1686 
1687 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1688 }
1689 
1690 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1691 	.family		=	PF_INET,
1692 	.obj_size	=	sizeof(struct tcp_request_sock),
1693 	.rtx_syn_ack	=	tcp_rtx_synack,
1694 	.send_ack	=	tcp_v4_reqsk_send_ack,
1695 	.destructor	=	tcp_v4_reqsk_destructor,
1696 	.send_reset	=	tcp_v4_send_reset,
1697 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1698 };
1699 
1700 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1701 	.mss_clamp	=	TCP_MSS_DEFAULT,
1702 #ifdef CONFIG_TCP_MD5SIG
1703 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1704 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1705 #endif
1706 #ifdef CONFIG_TCP_AO
1707 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1708 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1709 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1710 #endif
1711 #ifdef CONFIG_SYN_COOKIES
1712 	.cookie_init_seq =	cookie_v4_init_sequence,
1713 #endif
1714 	.route_req	=	tcp_v4_route_req,
1715 	.init_seq	=	tcp_v4_init_seq,
1716 	.init_ts_off	=	tcp_v4_init_ts_off,
1717 	.send_synack	=	tcp_v4_send_synack,
1718 };
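
/* tcp_conn_request() drives SYN processing through the two ops tables above,
 * so route lookup, ISN generation and SYN-ACK transmission stay behind
 * address-family independent hooks shared with the IPv6 code in tcp_ipv6.c.
 */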
1719 
1720 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1721 {
1722 	/* Never answer SYNs sent to broadcast or multicast */
1723 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1724 		goto drop;
1725 
1726 	return tcp_conn_request(&tcp_request_sock_ops,
1727 				&tcp_request_sock_ipv4_ops, sk, skb);
1728 
1729 drop:
1730 	tcp_listendrop(sk);
1731 	return 0;
1732 }
1733 EXPORT_SYMBOL(tcp_v4_conn_request);
1734 
1735 
1736 /*
1737  * The three-way handshake has completed - we got a valid ACK -
1738  * now create the new socket.
1739  */
1740 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1741 				  struct request_sock *req,
1742 				  struct dst_entry *dst,
1743 				  struct request_sock *req_unhash,
1744 				  bool *own_req)
1745 {
1746 	struct inet_request_sock *ireq;
1747 	bool found_dup_sk = false;
1748 	struct inet_sock *newinet;
1749 	struct tcp_sock *newtp;
1750 	struct sock *newsk;
1751 #ifdef CONFIG_TCP_MD5SIG
1752 	const union tcp_md5_addr *addr;
1753 	struct tcp_md5sig_key *key;
1754 	int l3index;
1755 #endif
1756 	struct ip_options_rcu *inet_opt;
1757 
1758 	if (sk_acceptq_is_full(sk))
1759 		goto exit_overflow;
1760 
1761 	newsk = tcp_create_openreq_child(sk, req, skb);
1762 	if (!newsk)
1763 		goto exit_nonewsk;
1764 
1765 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1766 	inet_sk_rx_dst_set(newsk, skb);
1767 
1768 	newtp		      = tcp_sk(newsk);
1769 	newinet		      = inet_sk(newsk);
1770 	ireq		      = inet_rsk(req);
1771 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1772 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1773 	newsk->sk_bound_dev_if = ireq->ir_iif;
1774 	newinet->inet_saddr   = ireq->ir_loc_addr;
1775 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1776 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1777 	newinet->mc_index     = inet_iif(skb);
1778 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1779 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1780 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1781 	if (inet_opt)
1782 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1783 	atomic_set(&newinet->inet_id, get_random_u16());
1784 
1785 	/* Set ToS of the new socket based upon the value of incoming SYN.
1786 	 * ECT bits are set later in tcp_init_transfer().
1787 	 */
1788 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1789 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1790 
1791 	if (!dst) {
1792 		dst = inet_csk_route_child_sock(sk, newsk, req);
1793 		if (!dst)
1794 			goto put_and_exit;
1795 	} else {
1796 		/* syncookie case: see end of cookie_v4_check() */
1797 	}
1798 	sk_setup_caps(newsk, dst);
1799 
1800 	tcp_ca_openreq_child(newsk, dst);
1801 
1802 	tcp_sync_mss(newsk, dst_mtu(dst));
1803 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1804 
1805 	tcp_initialize_rcv_mss(newsk);
1806 
1807 #ifdef CONFIG_TCP_MD5SIG
1808 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1809 	/* Copy over the MD5 key from the original socket */
1810 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1811 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1812 	if (key && !tcp_rsk_used_ao(req)) {
1813 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1814 			goto put_and_exit;
1815 		sk_gso_disable(newsk);
1816 	}
1817 #endif
1818 #ifdef CONFIG_TCP_AO
1819 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1820 		goto put_and_exit; /* OOM, release back memory */
1821 #endif
1822 
1823 	if (__inet_inherit_port(sk, newsk) < 0)
1824 		goto put_and_exit;
1825 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1826 				       &found_dup_sk);
1827 	if (likely(*own_req)) {
1828 		tcp_move_syn(newtp, req);
1829 		ireq->ireq_opt = NULL;
1830 	} else {
1831 		newinet->inet_opt = NULL;
1832 
1833 		if (!req_unhash && found_dup_sk) {
1834 			/* This code path should only be executed in the
1835 			 * syncookie case
1836 			 */
1837 			bh_unlock_sock(newsk);
1838 			sock_put(newsk);
1839 			newsk = NULL;
1840 		}
1841 	}
1842 	return newsk;
1843 
1844 exit_overflow:
1845 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1846 exit_nonewsk:
1847 	dst_release(dst);
1848 exit:
1849 	tcp_listendrop(sk);
1850 	return NULL;
1851 put_and_exit:
1852 	newinet->inet_opt = NULL;
1853 	inet_csk_prepare_forced_close(newsk);
1854 	tcp_done(newsk);
1855 	goto exit;
1856 }
1857 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1858 
1859 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1860 {
1861 #ifdef CONFIG_SYN_COOKIES
1862 	const struct tcphdr *th = tcp_hdr(skb);
1863 
1864 	if (!th->syn)
1865 		sk = cookie_v4_check(sk, skb);
1866 #endif
1867 	return sk;
1868 }
1869 
1870 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1871 			 struct tcphdr *th, u32 *cookie)
1872 {
1873 	u16 mss = 0;
1874 #ifdef CONFIG_SYN_COOKIES
1875 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1876 				    &tcp_request_sock_ipv4_ops, sk, th);
1877 	if (mss) {
1878 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1879 		tcp_synq_overflow(sk);
1880 	}
1881 #endif
1882 	return mss;
1883 }
1884 
1885 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1886 							   u32));
1887 /* The socket must have its spinlock held when we get
1888  * here, unless it is a TCP_LISTEN socket.
1889  *
1890  * We have a potential double-lock case here, so even when
1891  * doing backlog processing we use the BH locking scheme.
1892  * This is because we cannot sleep with the original spinlock
1893  * held.
1894  */
1895 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1896 {
1897 	enum skb_drop_reason reason;
1898 	struct sock *rsk;
1899 
1900 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1901 		struct dst_entry *dst;
1902 
1903 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1904 						lockdep_sock_is_held(sk));
1905 
1906 		sock_rps_save_rxhash(sk, skb);
1907 		sk_mark_napi_id(sk, skb);
1908 		if (dst) {
1909 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1910 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1911 					     dst, 0)) {
1912 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1913 				dst_release(dst);
1914 			}
1915 		}
1916 		tcp_rcv_established(sk, skb);
1917 		return 0;
1918 	}
1919 
1920 	if (tcp_checksum_complete(skb))
1921 		goto csum_err;
1922 
1923 	if (sk->sk_state == TCP_LISTEN) {
1924 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1925 
1926 		if (!nsk)
1927 			return 0;
1928 		if (nsk != sk) {
1929 			reason = tcp_child_process(sk, nsk, skb);
1930 			if (reason) {
1931 				rsk = nsk;
1932 				goto reset;
1933 			}
1934 			return 0;
1935 		}
1936 	} else
1937 		sock_rps_save_rxhash(sk, skb);
1938 
1939 	reason = tcp_rcv_state_process(sk, skb);
1940 	if (reason) {
1941 		rsk = sk;
1942 		goto reset;
1943 	}
1944 	return 0;
1945 
1946 reset:
1947 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1948 discard:
1949 	sk_skb_reason_drop(sk, skb, reason);
1950 	/* Be careful here. If this function gets more complicated and
1951 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1952 	 * might be destroyed here. This current version compiles correctly,
1953 	 * but you have been warned.
1954 	 */
1955 	return 0;
1956 
1957 csum_err:
1958 	reason = SKB_DROP_REASON_TCP_CSUM;
1959 	trace_tcp_bad_csum(skb);
1960 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1961 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1962 	goto discard;
1963 }
1964 EXPORT_SYMBOL(tcp_v4_do_rcv);
1965 
1966 int tcp_v4_early_demux(struct sk_buff *skb)
1967 {
1968 	struct net *net = dev_net(skb->dev);
1969 	const struct iphdr *iph;
1970 	const struct tcphdr *th;
1971 	struct sock *sk;
1972 
1973 	if (skb->pkt_type != PACKET_HOST)
1974 		return 0;
1975 
1976 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1977 		return 0;
1978 
1979 	iph = ip_hdr(skb);
1980 	th = tcp_hdr(skb);
1981 
1982 	if (th->doff < sizeof(struct tcphdr) / 4)
1983 		return 0;
1984 
1985 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1986 				       iph->saddr, th->source,
1987 				       iph->daddr, ntohs(th->dest),
1988 				       skb->skb_iif, inet_sdif(skb));
1989 	if (sk) {
1990 		skb->sk = sk;
1991 		skb->destructor = sock_edemux;
1992 		if (sk_fullsock(sk)) {
1993 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1994 
1995 			if (dst)
1996 				dst = dst_check(dst, 0);
1997 			if (dst &&
1998 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1999 				skb_dst_set_noref(skb, dst);
2000 		}
2001 	}
2002 	return 0;
2003 }
2004 
2005 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2006 		     enum skb_drop_reason *reason)
2007 {
2008 	u32 tail_gso_size, tail_gso_segs;
2009 	struct skb_shared_info *shinfo;
2010 	const struct tcphdr *th;
2011 	struct tcphdr *thtail;
2012 	struct sk_buff *tail;
2013 	unsigned int hdrlen;
2014 	bool fragstolen;
2015 	u32 gso_segs;
2016 	u32 gso_size;
2017 	u64 limit;
2018 	int delta;
2019 
2020 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2021 	 * we can fix skb->truesize to its real value to avoid future drops.
2022 	 * This is valid because skb is not yet charged to the socket.
2023 	 * It has been noticed that pure SACK packets were sometimes dropped
2024 	 * (if cooked by drivers without the copybreak feature).
2025 	 */
2026 	skb_condense(skb);
2027 
2028 	skb_dst_drop(skb);
2029 
2030 	if (unlikely(tcp_checksum_complete(skb))) {
2031 		bh_unlock_sock(sk);
2032 		trace_tcp_bad_csum(skb);
2033 		*reason = SKB_DROP_REASON_TCP_CSUM;
2034 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2035 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2036 		return true;
2037 	}
2038 
2039 	/* Attempt coalescing to last skb in backlog, even if we are
2040 	 * above the limits.
2041 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2042 	 */
2043 	th = (const struct tcphdr *)skb->data;
2044 	hdrlen = th->doff * 4;
2045 
2046 	tail = sk->sk_backlog.tail;
2047 	if (!tail)
2048 		goto no_coalesce;
2049 	thtail = (struct tcphdr *)tail->data;
2050 
2051 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2052 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2053 	    ((TCP_SKB_CB(tail)->tcp_flags |
2054 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2055 	    !((TCP_SKB_CB(tail)->tcp_flags &
2056 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2057 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2058 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2059 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2060 	    thtail->doff != th->doff ||
2061 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2062 		goto no_coalesce;
2063 
2064 	__skb_pull(skb, hdrlen);
2065 
2066 	shinfo = skb_shinfo(skb);
2067 	gso_size = shinfo->gso_size ?: skb->len;
2068 	gso_segs = shinfo->gso_segs ?: 1;
2069 
2070 	shinfo = skb_shinfo(tail);
2071 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2072 	tail_gso_segs = shinfo->gso_segs ?: 1;
2073 
2074 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2075 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2076 
2077 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2078 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2079 			thtail->window = th->window;
2080 		}
2081 
2082 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2083 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2084 		 * is not entered if we append a packet with a FIN.
2085 		 * SYN, RST, URG are not present.
2086 		 * ACK is set on both packets.
2087 		 * PSH : we do not really care in TCP stack,
2088 		 *       at least for 'GRO' packets.
2089 		 */
2090 		thtail->fin |= th->fin;
2091 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2092 
2093 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2094 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2095 			tail->tstamp = skb->tstamp;
2096 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2097 		}
2098 
2099 		/* Not as strict as GRO. We only need to carry mss max value */
2100 		shinfo->gso_size = max(gso_size, tail_gso_size);
2101 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2102 
2103 		sk->sk_backlog.len += delta;
2104 		__NET_INC_STATS(sock_net(sk),
2105 				LINUX_MIB_TCPBACKLOGCOALESCE);
2106 		kfree_skb_partial(skb, fragstolen);
2107 		return false;
2108 	}
2109 	__skb_push(skb, hdrlen);
2110 
2111 no_coalesce:
2112 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2113 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2114 	 * sk_rcvbuf in normal conditions.
2115 	 */
2116 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2117 
2118 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2119 
2120 	/* Only socket owner can try to collapse/prune rx queues
2121 	 * to reduce memory overhead, so add a little headroom here.
2122 	 * Only a few socket backlogs are likely to be non-empty concurrently.
2123 	 */
2124 	limit += 64 * 1024;
2125 
2126 	limit = min_t(u64, limit, UINT_MAX);
2127 
2128 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2129 		bh_unlock_sock(sk);
2130 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2131 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2132 		return true;
2133 	}
2134 	return false;
2135 }
2136 EXPORT_SYMBOL(tcp_add_backlog);
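/*
 * Illustrative sketch (hypothetical, not part of the kernel tree): the
 * backlog limit computed in tcp_add_backlog() above is roughly
 *
 *	limit = 2 * sk_rcvbuf + sk_sndbuf / 2 + 64 KB, clamped to UINT_MAX
 *
 * A minimal standalone rendering of that arithmetic; the helper name and
 * the buffer sizes are made up for illustration only:
 *
 *	static u64 example_backlog_limit(u32 rcvbuf, u32 sndbuf)
 *	{
 *		u64 limit = (u64)rcvbuf << 1;	// twice the receive buffer
 *
 *		limit += sndbuf >> 1;		// half the send buffer
 *		limit += 64 * 1024;		// headroom for collapse/prune
 *		return min_t(u64, limit, UINT_MAX);
 *	}
 *
 * E.g. rcvbuf = sndbuf = 212992 gives 425984 + 106496 + 65536 = 598016
 * bytes of backlog before sk_add_backlog() starts dropping.
 */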
2137 
2138 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2139 {
2140 	struct tcphdr *th = (struct tcphdr *)skb->data;
2141 
2142 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2143 }
2144 EXPORT_SYMBOL(tcp_filter);
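/*
 * Illustrative note (not part of the kernel tree): sk_filter_trim_cap()
 * above runs any attached socket filter on the skb and caps trimming at
 * th->doff * 4, i.e. the TCP header length.  For instance, with
 * th->doff == 8 (a 32-byte header including options), a filter may trim
 * away payload bytes but never any part of the TCP header itself.
 */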
2145 
2146 static void tcp_v4_restore_cb(struct sk_buff *skb)
2147 {
2148 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2149 		sizeof(struct inet_skb_parm));
2150 }
2151 
2152 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2153 			   const struct tcphdr *th)
2154 {
2155 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2156 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2157 	 */
2158 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2159 		sizeof(struct inet_skb_parm));
2160 	barrier();
2161 
2162 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2163 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2164 				    skb->len - th->doff * 4);
2165 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2166 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2167 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2168 	TCP_SKB_CB(skb)->sacked	 = 0;
2169 	TCP_SKB_CB(skb)->has_rxtstamp =
2170 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2171 }
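/*
 * Illustrative note (not part of the kernel tree): end_seq above counts
 * SYN and FIN as one sequence number each, on top of the payload:
 *
 *	end_seq = seq + th->syn + th->fin + (skb->len - th->doff * 4);
 *
 * e.g. a bare SYN with seq = 1000 yields end_seq = 1001, while a pure
 * data segment with seq = 1000 and 500 payload bytes yields
 * end_seq = 1500.
 */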
2172 
2173 /*
2174  *	From tcp_input.c
2175  */
2176 
2177 int tcp_v4_rcv(struct sk_buff *skb)
2178 {
2179 	struct net *net = dev_net(skb->dev);
2180 	enum skb_drop_reason drop_reason;
2181 	int sdif = inet_sdif(skb);
2182 	int dif = inet_iif(skb);
2183 	const struct iphdr *iph;
2184 	const struct tcphdr *th;
2185 	struct sock *sk = NULL;
2186 	bool refcounted;
2187 	int ret;
2188 	u32 isn;
2189 
2190 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2191 	if (skb->pkt_type != PACKET_HOST)
2192 		goto discard_it;
2193 
2194 	/* Count it even if it's bad */
2195 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2196 
2197 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2198 		goto discard_it;
2199 
2200 	th = (const struct tcphdr *)skb->data;
2201 
2202 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2203 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2204 		goto bad_packet;
2205 	}
2206 	if (!pskb_may_pull(skb, th->doff * 4))
2207 		goto discard_it;
2208 
2209 	/* An explanation is warranted here:
2210 	 * packet length and doff are validated later by header prediction,
2211 	 * provided the th->doff == 0 case has been eliminated above.
2212 	 * So we defer the remaining checks. */
2213 
2214 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2215 		goto csum_error;
2216 
2217 	th = (const struct tcphdr *)skb->data;
2218 	iph = ip_hdr(skb);
2219 lookup:
2220 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2221 			       skb, __tcp_hdrlen(th), th->source,
2222 			       th->dest, sdif, &refcounted);
2223 	if (!sk)
2224 		goto no_tcp_socket;
2225 
2226 	if (sk->sk_state == TCP_TIME_WAIT)
2227 		goto do_time_wait;
2228 
2229 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2230 		struct request_sock *req = inet_reqsk(sk);
2231 		bool req_stolen = false;
2232 		struct sock *nsk;
2233 
2234 		sk = req->rsk_listener;
2235 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2236 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2237 		else
2238 			drop_reason = tcp_inbound_hash(sk, req, skb,
2239 						       &iph->saddr, &iph->daddr,
2240 						       AF_INET, dif, sdif);
2241 		if (unlikely(drop_reason)) {
2242 			sk_drops_add(sk, skb);
2243 			reqsk_put(req);
2244 			goto discard_it;
2245 		}
2246 		if (tcp_checksum_complete(skb)) {
2247 			reqsk_put(req);
2248 			goto csum_error;
2249 		}
2250 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2251 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2252 			if (!nsk) {
2253 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2254 				goto lookup;
2255 			}
2256 			sk = nsk;
2257 			/* reuseport_migrate_sock() has already held one sk_refcnt
2258 			 * before returning.
2259 			 */
2260 		} else {
2261 			/* We own a reference on the listener, increase it again
2262 			 * as we might lose it too soon.
2263 			 */
2264 			sock_hold(sk);
2265 		}
2266 		refcounted = true;
2267 		nsk = NULL;
2268 		if (!tcp_filter(sk, skb)) {
2269 			th = (const struct tcphdr *)skb->data;
2270 			iph = ip_hdr(skb);
2271 			tcp_v4_fill_cb(skb, iph, th);
2272 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2273 		} else {
2274 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2275 		}
2276 		if (!nsk) {
2277 			reqsk_put(req);
2278 			if (req_stolen) {
2279 				/* Another cpu got exclusive access to req
2280 				 * and created a full blown socket.
2281 				 * Try to feed this packet to this socket
2282 				 * instead of discarding it.
2283 				 */
2284 				tcp_v4_restore_cb(skb);
2285 				sock_put(sk);
2286 				goto lookup;
2287 			}
2288 			goto discard_and_relse;
2289 		}
2290 		nf_reset_ct(skb);
2291 		if (nsk == sk) {
2292 			reqsk_put(req);
2293 			tcp_v4_restore_cb(skb);
2294 		} else {
2295 			drop_reason = tcp_child_process(sk, nsk, skb);
2296 			if (drop_reason) {
2297 				enum sk_rst_reason rst_reason;
2298 
2299 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2300 				tcp_v4_send_reset(nsk, skb, rst_reason);
2301 				goto discard_and_relse;
2302 			}
2303 			sock_put(sk);
2304 			return 0;
2305 		}
2306 	}
2307 
2308 process:
2309 	if (static_branch_unlikely(&ip4_min_ttl)) {
2310 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2311 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2312 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2313 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2314 			goto discard_and_relse;
2315 		}
2316 	}
2317 
2318 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2319 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2320 		goto discard_and_relse;
2321 	}
2322 
2323 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2324 				       AF_INET, dif, sdif);
2325 	if (drop_reason)
2326 		goto discard_and_relse;
2327 
2328 	nf_reset_ct(skb);
2329 
2330 	if (tcp_filter(sk, skb)) {
2331 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2332 		goto discard_and_relse;
2333 	}
2334 	th = (const struct tcphdr *)skb->data;
2335 	iph = ip_hdr(skb);
2336 	tcp_v4_fill_cb(skb, iph, th);
2337 
2338 	skb->dev = NULL;
2339 
2340 	if (sk->sk_state == TCP_LISTEN) {
2341 		ret = tcp_v4_do_rcv(sk, skb);
2342 		goto put_and_return;
2343 	}
2344 
2345 	sk_incoming_cpu_update(sk);
2346 
2347 	bh_lock_sock_nested(sk);
2348 	tcp_segs_in(tcp_sk(sk), skb);
2349 	ret = 0;
2350 	if (!sock_owned_by_user(sk)) {
2351 		ret = tcp_v4_do_rcv(sk, skb);
2352 	} else {
2353 		if (tcp_add_backlog(sk, skb, &drop_reason))
2354 			goto discard_and_relse;
2355 	}
2356 	bh_unlock_sock(sk);
2357 
2358 put_and_return:
2359 	if (refcounted)
2360 		sock_put(sk);
2361 
2362 	return ret;
2363 
2364 no_tcp_socket:
2365 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2366 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2367 		goto discard_it;
2368 
2369 	tcp_v4_fill_cb(skb, iph, th);
2370 
2371 	if (tcp_checksum_complete(skb)) {
2372 csum_error:
2373 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2374 		trace_tcp_bad_csum(skb);
2375 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2376 bad_packet:
2377 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2378 	} else {
2379 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2380 	}
2381 
2382 discard_it:
2383 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2384 	/* Discard frame. */
2385 	sk_skb_reason_drop(sk, skb, drop_reason);
2386 	return 0;
2387 
2388 discard_and_relse:
2389 	sk_drops_add(sk, skb);
2390 	if (refcounted)
2391 		sock_put(sk);
2392 	goto discard_it;
2393 
2394 do_time_wait:
2395 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2396 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2397 		inet_twsk_put(inet_twsk(sk));
2398 		goto discard_it;
2399 	}
2400 
2401 	tcp_v4_fill_cb(skb, iph, th);
2402 
2403 	if (tcp_checksum_complete(skb)) {
2404 		inet_twsk_put(inet_twsk(sk));
2405 		goto csum_error;
2406 	}
2407 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2408 	case TCP_TW_SYN: {
2409 		struct sock *sk2 = inet_lookup_listener(net,
2410 							net->ipv4.tcp_death_row.hashinfo,
2411 							skb, __tcp_hdrlen(th),
2412 							iph->saddr, th->source,
2413 							iph->daddr, th->dest,
2414 							inet_iif(skb),
2415 							sdif);
2416 		if (sk2) {
2417 			inet_twsk_deschedule_put(inet_twsk(sk));
2418 			sk = sk2;
2419 			tcp_v4_restore_cb(skb);
2420 			refcounted = false;
2421 			__this_cpu_write(tcp_tw_isn, isn);
2422 			goto process;
2423 		}
2424 	}
2425 		/* to ACK */
2426 		fallthrough;
2427 	case TCP_TW_ACK:
2428 		tcp_v4_timewait_ack(sk, skb);
2429 		break;
2430 	case TCP_TW_RST:
2431 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2432 		inet_twsk_deschedule_put(inet_twsk(sk));
2433 		goto discard_it;
2434 	case TCP_TW_SUCCESS:;
2435 	}
2436 	goto discard_it;
2437 }
2438 
2439 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2440 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2441 	.twsk_destructor= tcp_twsk_destructor,
2442 };
2443 
2444 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2445 {
2446 	struct dst_entry *dst = skb_dst(skb);
2447 
2448 	if (dst && dst_hold_safe(dst)) {
2449 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2450 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2451 	}
2452 }
2453 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2454 
2455 const struct inet_connection_sock_af_ops ipv4_specific = {
2456 	.queue_xmit	   = ip_queue_xmit,
2457 	.send_check	   = tcp_v4_send_check,
2458 	.rebuild_header	   = inet_sk_rebuild_header,
2459 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2460 	.conn_request	   = tcp_v4_conn_request,
2461 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2462 	.net_header_len	   = sizeof(struct iphdr),
2463 	.setsockopt	   = ip_setsockopt,
2464 	.getsockopt	   = ip_getsockopt,
2465 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2466 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2467 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2468 };
2469 EXPORT_SYMBOL(ipv4_specific);
2470 
2471 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2472 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2473 #ifdef CONFIG_TCP_MD5SIG
2474 	.md5_lookup		= tcp_v4_md5_lookup,
2475 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2476 	.md5_parse		= tcp_v4_parse_md5_keys,
2477 #endif
2478 #ifdef CONFIG_TCP_AO
2479 	.ao_lookup		= tcp_v4_ao_lookup,
2480 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2481 	.ao_parse		= tcp_v4_parse_ao,
2482 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2483 #endif
2484 };
2485 #endif
2486 
2487 /* NOTE: Many fields are already zeroed explicitly by the call to
2488  *       sk_alloc(), so they need not be initialized here.
2489  */
2490 static int tcp_v4_init_sock(struct sock *sk)
2491 {
2492 	struct inet_connection_sock *icsk = inet_csk(sk);
2493 
2494 	tcp_init_sock(sk);
2495 
2496 	icsk->icsk_af_ops = &ipv4_specific;
2497 
2498 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2499 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2500 #endif
2501 
2502 	return 0;
2503 }
2504 
2505 #ifdef CONFIG_TCP_MD5SIG
2506 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2507 {
2508 	struct tcp_md5sig_info *md5sig;
2509 
2510 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2511 	kfree(md5sig);
2512 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2513 	tcp_md5_release_sigpool();
2514 }
2515 #endif
2516 
2517 static void tcp_release_user_frags(struct sock *sk)
2518 {
2519 #ifdef CONFIG_PAGE_POOL
2520 	unsigned long index;
2521 	void *netmem;
2522 
2523 	xa_for_each(&sk->sk_user_frags, index, netmem)
2524 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2525 #endif
2526 }
2527 
2528 void tcp_v4_destroy_sock(struct sock *sk)
2529 {
2530 	struct tcp_sock *tp = tcp_sk(sk);
2531 
2532 	tcp_release_user_frags(sk);
2533 
2534 	xa_destroy(&sk->sk_user_frags);
2535 
2536 	trace_tcp_destroy_sock(sk);
2537 
2538 	tcp_clear_xmit_timers(sk);
2539 
2540 	tcp_cleanup_congestion_control(sk);
2541 
2542 	tcp_cleanup_ulp(sk);
2543 
2544 	/* Cleanup up the write buffer. */
2545 	tcp_write_queue_purge(sk);
2546 
2547 	/* Check if we want to disable active TFO */
2548 	tcp_fastopen_active_disable_ofo_check(sk);
2549 
2550 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2551 	skb_rbtree_purge(&tp->out_of_order_queue);
2552 
2553 #ifdef CONFIG_TCP_MD5SIG
2554 	/* Clean up the MD5 key list, if any */
2555 	if (tp->md5sig_info) {
2556 		struct tcp_md5sig_info *md5sig;
2557 
2558 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2559 		tcp_clear_md5_list(sk);
2560 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2561 		rcu_assign_pointer(tp->md5sig_info, NULL);
2562 	}
2563 #endif
2564 	tcp_ao_destroy_sock(sk, false);
2565 
2566 	/* Clean up a referenced TCP bind bucket. */
2567 	if (inet_csk(sk)->icsk_bind_hash)
2568 		inet_put_port(sk);
2569 
2570 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2571 
2572 	/* If socket is aborted during connect operation */
2573 	tcp_free_fastopen_req(tp);
2574 	tcp_fastopen_destroy_cipher(sk);
2575 	tcp_saved_syn_free(tp);
2576 
2577 	sk_sockets_allocated_dec(sk);
2578 }
2579 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2580 
2581 #ifdef CONFIG_PROC_FS
2582 /* Proc filesystem TCP sock list dumping. */
2583 
2584 static unsigned short seq_file_family(const struct seq_file *seq);
2585 
2586 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2587 {
2588 	unsigned short family = seq_file_family(seq);
2589 
2590 	/* AF_UNSPEC is used as a match all */
2591 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2592 		net_eq(sock_net(sk), seq_file_net(seq)));
2593 }
2594 
2595 /* Find a non-empty bucket (starting from st->bucket)
2596  * and return the first sk from it.
2597  */
2598 static void *listening_get_first(struct seq_file *seq)
2599 {
2600 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2601 	struct tcp_iter_state *st = seq->private;
2602 
2603 	st->offset = 0;
2604 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2605 		struct inet_listen_hashbucket *ilb2;
2606 		struct hlist_nulls_node *node;
2607 		struct sock *sk;
2608 
2609 		ilb2 = &hinfo->lhash2[st->bucket];
2610 		if (hlist_nulls_empty(&ilb2->nulls_head))
2611 			continue;
2612 
2613 		spin_lock(&ilb2->lock);
2614 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2615 			if (seq_sk_match(seq, sk))
2616 				return sk;
2617 		}
2618 		spin_unlock(&ilb2->lock);
2619 	}
2620 
2621 	return NULL;
2622 }
2623 
2624 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2625  * If "cur" is the last one in the st->bucket,
2626  * call listening_get_first() to return the first sk of the next
2627  * non-empty bucket.
2628  */
2629 static void *listening_get_next(struct seq_file *seq, void *cur)
2630 {
2631 	struct tcp_iter_state *st = seq->private;
2632 	struct inet_listen_hashbucket *ilb2;
2633 	struct hlist_nulls_node *node;
2634 	struct inet_hashinfo *hinfo;
2635 	struct sock *sk = cur;
2636 
2637 	++st->num;
2638 	++st->offset;
2639 
2640 	sk = sk_nulls_next(sk);
2641 	sk_nulls_for_each_from(sk, node) {
2642 		if (seq_sk_match(seq, sk))
2643 			return sk;
2644 	}
2645 
2646 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2647 	ilb2 = &hinfo->lhash2[st->bucket];
2648 	spin_unlock(&ilb2->lock);
2649 	++st->bucket;
2650 	return listening_get_first(seq);
2651 }
2652 
2653 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2654 {
2655 	struct tcp_iter_state *st = seq->private;
2656 	void *rc;
2657 
2658 	st->bucket = 0;
2659 	st->offset = 0;
2660 	rc = listening_get_first(seq);
2661 
2662 	while (rc && *pos) {
2663 		rc = listening_get_next(seq, rc);
2664 		--*pos;
2665 	}
2666 	return rc;
2667 }
2668 
2669 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2670 				const struct tcp_iter_state *st)
2671 {
2672 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2673 }
2674 
2675 /*
2676  * Get first established socket starting from bucket given in st->bucket.
2677  * If st->bucket is zero, the very first socket in the hash is returned.
2678  */
2679 static void *established_get_first(struct seq_file *seq)
2680 {
2681 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2682 	struct tcp_iter_state *st = seq->private;
2683 
2684 	st->offset = 0;
2685 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2686 		struct sock *sk;
2687 		struct hlist_nulls_node *node;
2688 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2689 
2690 		cond_resched();
2691 
2692 		/* Lockless fast path for the common case of empty buckets */
2693 		if (empty_bucket(hinfo, st))
2694 			continue;
2695 
2696 		spin_lock_bh(lock);
2697 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2698 			if (seq_sk_match(seq, sk))
2699 				return sk;
2700 		}
2701 		spin_unlock_bh(lock);
2702 	}
2703 
2704 	return NULL;
2705 }
2706 
2707 static void *established_get_next(struct seq_file *seq, void *cur)
2708 {
2709 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2710 	struct tcp_iter_state *st = seq->private;
2711 	struct hlist_nulls_node *node;
2712 	struct sock *sk = cur;
2713 
2714 	++st->num;
2715 	++st->offset;
2716 
2717 	sk = sk_nulls_next(sk);
2718 
2719 	sk_nulls_for_each_from(sk, node) {
2720 		if (seq_sk_match(seq, sk))
2721 			return sk;
2722 	}
2723 
2724 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2725 	++st->bucket;
2726 	return established_get_first(seq);
2727 }
2728 
2729 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2730 {
2731 	struct tcp_iter_state *st = seq->private;
2732 	void *rc;
2733 
2734 	st->bucket = 0;
2735 	rc = established_get_first(seq);
2736 
2737 	while (rc && pos) {
2738 		rc = established_get_next(seq, rc);
2739 		--pos;
2740 	}
2741 	return rc;
2742 }
2743 
2744 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2745 {
2746 	void *rc;
2747 	struct tcp_iter_state *st = seq->private;
2748 
2749 	st->state = TCP_SEQ_STATE_LISTENING;
2750 	rc	  = listening_get_idx(seq, &pos);
2751 
2752 	if (!rc) {
2753 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2754 		rc	  = established_get_idx(seq, pos);
2755 	}
2756 
2757 	return rc;
2758 }
2759 
2760 static void *tcp_seek_last_pos(struct seq_file *seq)
2761 {
2762 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2763 	struct tcp_iter_state *st = seq->private;
2764 	int bucket = st->bucket;
2765 	int offset = st->offset;
2766 	int orig_num = st->num;
2767 	void *rc = NULL;
2768 
2769 	switch (st->state) {
2770 	case TCP_SEQ_STATE_LISTENING:
2771 		if (st->bucket > hinfo->lhash2_mask)
2772 			break;
2773 		rc = listening_get_first(seq);
2774 		while (offset-- && rc && bucket == st->bucket)
2775 			rc = listening_get_next(seq, rc);
2776 		if (rc)
2777 			break;
2778 		st->bucket = 0;
2779 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2780 		fallthrough;
2781 	case TCP_SEQ_STATE_ESTABLISHED:
2782 		if (st->bucket > hinfo->ehash_mask)
2783 			break;
2784 		rc = established_get_first(seq);
2785 		while (offset-- && rc && bucket == st->bucket)
2786 			rc = established_get_next(seq, rc);
2787 	}
2788 
2789 	st->num = orig_num;
2790 
2791 	return rc;
2792 }
2793 
2794 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2795 {
2796 	struct tcp_iter_state *st = seq->private;
2797 	void *rc;
2798 
2799 	if (*pos && *pos == st->last_pos) {
2800 		rc = tcp_seek_last_pos(seq);
2801 		if (rc)
2802 			goto out;
2803 	}
2804 
2805 	st->state = TCP_SEQ_STATE_LISTENING;
2806 	st->num = 0;
2807 	st->bucket = 0;
2808 	st->offset = 0;
2809 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2810 
2811 out:
2812 	st->last_pos = *pos;
2813 	return rc;
2814 }
2815 EXPORT_SYMBOL(tcp_seq_start);
2816 
2817 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2818 {
2819 	struct tcp_iter_state *st = seq->private;
2820 	void *rc = NULL;
2821 
2822 	if (v == SEQ_START_TOKEN) {
2823 		rc = tcp_get_idx(seq, 0);
2824 		goto out;
2825 	}
2826 
2827 	switch (st->state) {
2828 	case TCP_SEQ_STATE_LISTENING:
2829 		rc = listening_get_next(seq, v);
2830 		if (!rc) {
2831 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2832 			st->bucket = 0;
2833 			st->offset = 0;
2834 			rc	  = established_get_first(seq);
2835 		}
2836 		break;
2837 	case TCP_SEQ_STATE_ESTABLISHED:
2838 		rc = established_get_next(seq, v);
2839 		break;
2840 	}
2841 out:
2842 	++*pos;
2843 	st->last_pos = *pos;
2844 	return rc;
2845 }
2846 EXPORT_SYMBOL(tcp_seq_next);
2847 
2848 void tcp_seq_stop(struct seq_file *seq, void *v)
2849 {
2850 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2851 	struct tcp_iter_state *st = seq->private;
2852 
2853 	switch (st->state) {
2854 	case TCP_SEQ_STATE_LISTENING:
2855 		if (v != SEQ_START_TOKEN)
2856 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2857 		break;
2858 	case TCP_SEQ_STATE_ESTABLISHED:
2859 		if (v)
2860 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2861 		break;
2862 	}
2863 }
2864 EXPORT_SYMBOL(tcp_seq_stop);
2865 
2866 static void get_openreq4(const struct request_sock *req,
2867 			 struct seq_file *f, int i)
2868 {
2869 	const struct inet_request_sock *ireq = inet_rsk(req);
2870 	long delta = req->rsk_timer.expires - jiffies;
2871 
2872 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2873 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2874 		i,
2875 		ireq->ir_loc_addr,
2876 		ireq->ir_num,
2877 		ireq->ir_rmt_addr,
2878 		ntohs(ireq->ir_rmt_port),
2879 		TCP_SYN_RECV,
2880 		0, 0, /* could print option size, but that is af dependent. */
2881 		1,    /* timers active (only the expire timer) */
2882 		jiffies_delta_to_clock_t(delta),
2883 		req->num_timeout,
2884 		from_kuid_munged(seq_user_ns(f),
2885 				 sock_i_uid(req->rsk_listener)),
2886 		0,  /* non standard timer */
2887 		0, /* open_requests have no inode */
2888 		0,
2889 		req);
2890 }
2891 
2892 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2893 {
2894 	int timer_active;
2895 	unsigned long timer_expires;
2896 	const struct tcp_sock *tp = tcp_sk(sk);
2897 	const struct inet_connection_sock *icsk = inet_csk(sk);
2898 	const struct inet_sock *inet = inet_sk(sk);
2899 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2900 	__be32 dest = inet->inet_daddr;
2901 	__be32 src = inet->inet_rcv_saddr;
2902 	__u16 destp = ntohs(inet->inet_dport);
2903 	__u16 srcp = ntohs(inet->inet_sport);
2904 	u8 icsk_pending;
2905 	int rx_queue;
2906 	int state;
2907 
2908 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2909 	if (icsk_pending == ICSK_TIME_RETRANS ||
2910 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2911 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2912 		timer_active	= 1;
2913 		timer_expires	= icsk->icsk_timeout;
2914 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2915 		timer_active	= 4;
2916 		timer_expires	= icsk->icsk_timeout;
2917 	} else if (timer_pending(&sk->sk_timer)) {
2918 		timer_active	= 2;
2919 		timer_expires	= sk->sk_timer.expires;
2920 	} else {
2921 		timer_active	= 0;
2922 		timer_expires = jiffies;
2923 	}
2924 
2925 	state = inet_sk_state_load(sk);
2926 	if (state == TCP_LISTEN)
2927 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2928 	else
2929 		/* Because we don't lock the socket,
2930 		 * we might find a transient negative value.
2931 		 */
2932 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2933 				      READ_ONCE(tp->copied_seq), 0);
2934 
2935 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2936 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2937 		i, src, srcp, dest, destp, state,
2938 		READ_ONCE(tp->write_seq) - tp->snd_una,
2939 		rx_queue,
2940 		timer_active,
2941 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2942 		icsk->icsk_retransmits,
2943 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2944 		icsk->icsk_probes_out,
2945 		sock_i_ino(sk),
2946 		refcount_read(&sk->sk_refcnt), sk,
2947 		jiffies_to_clock_t(icsk->icsk_rto),
2948 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2949 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2950 		tcp_snd_cwnd(tp),
2951 		state == TCP_LISTEN ?
2952 		    fastopenq->max_qlen :
2953 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2954 }
2955 
2956 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2957 			       struct seq_file *f, int i)
2958 {
2959 	long delta = tw->tw_timer.expires - jiffies;
2960 	__be32 dest, src;
2961 	__u16 destp, srcp;
2962 
2963 	dest  = tw->tw_daddr;
2964 	src   = tw->tw_rcv_saddr;
2965 	destp = ntohs(tw->tw_dport);
2966 	srcp  = ntohs(tw->tw_sport);
2967 
2968 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2969 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2970 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2971 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2972 		refcount_read(&tw->tw_refcnt), tw);
2973 }
2974 
2975 #define TMPSZ 150
2976 
2977 static int tcp4_seq_show(struct seq_file *seq, void *v)
2978 {
2979 	struct tcp_iter_state *st;
2980 	struct sock *sk = v;
2981 
2982 	seq_setwidth(seq, TMPSZ - 1);
2983 	if (v == SEQ_START_TOKEN) {
2984 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2985 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2986 			   "inode");
2987 		goto out;
2988 	}
2989 	st = seq->private;
2990 
2991 	if (sk->sk_state == TCP_TIME_WAIT)
2992 		get_timewait4_sock(v, seq, st->num);
2993 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2994 		get_openreq4(v, seq, st->num);
2995 	else
2996 		get_tcp4_sock(v, seq, st->num);
2997 out:
2998 	seq_pad(seq, '\n');
2999 	return 0;
3000 }
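/*
 * Illustrative example (hypothetical values, not part of the kernel
 * tree): on a little-endian host, a /proc/net/tcp entry produced by the
 * helpers above looks roughly like this (addresses and ports in hex,
 * state 0A == TCP_LISTEN):
 *
 *	  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *	   0: 0100007F:0CEA 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 12345 1 0000000000000000 ...
 *
 * i.e. a socket listening on 127.0.0.1:3306 owned by uid 1000, with no
 * retransmit, probe or keepalive timer pending.
 */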
3001 
3002 #ifdef CONFIG_BPF_SYSCALL
3003 struct bpf_tcp_iter_state {
3004 	struct tcp_iter_state state;
3005 	unsigned int cur_sk;
3006 	unsigned int end_sk;
3007 	unsigned int max_sk;
3008 	struct sock **batch;
3009 	bool st_bucket_done;
3010 };
3011 
3012 struct bpf_iter__tcp {
3013 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3014 	__bpf_md_ptr(struct sock_common *, sk_common);
3015 	uid_t uid __aligned(8);
3016 };
3017 
3018 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3019 			     struct sock_common *sk_common, uid_t uid)
3020 {
3021 	struct bpf_iter__tcp ctx;
3022 
3023 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3024 	ctx.meta = meta;
3025 	ctx.sk_common = sk_common;
3026 	ctx.uid = uid;
3027 	return bpf_iter_run_prog(prog, &ctx);
3028 }
3029 
3030 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3031 {
3032 	while (iter->cur_sk < iter->end_sk)
3033 		sock_gen_put(iter->batch[iter->cur_sk++]);
3034 }
3035 
3036 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3037 				      unsigned int new_batch_sz)
3038 {
3039 	struct sock **new_batch;
3040 
3041 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3042 			     GFP_USER | __GFP_NOWARN);
3043 	if (!new_batch)
3044 		return -ENOMEM;
3045 
3046 	bpf_iter_tcp_put_batch(iter);
3047 	kvfree(iter->batch);
3048 	iter->batch = new_batch;
3049 	iter->max_sk = new_batch_sz;
3050 
3051 	return 0;
3052 }
3053 
3054 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3055 						 struct sock *start_sk)
3056 {
3057 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3058 	struct bpf_tcp_iter_state *iter = seq->private;
3059 	struct tcp_iter_state *st = &iter->state;
3060 	struct hlist_nulls_node *node;
3061 	unsigned int expected = 1;
3062 	struct sock *sk;
3063 
3064 	sock_hold(start_sk);
3065 	iter->batch[iter->end_sk++] = start_sk;
3066 
3067 	sk = sk_nulls_next(start_sk);
3068 	sk_nulls_for_each_from(sk, node) {
3069 		if (seq_sk_match(seq, sk)) {
3070 			if (iter->end_sk < iter->max_sk) {
3071 				sock_hold(sk);
3072 				iter->batch[iter->end_sk++] = sk;
3073 			}
3074 			expected++;
3075 		}
3076 	}
3077 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3078 
3079 	return expected;
3080 }
3081 
3082 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3083 						   struct sock *start_sk)
3084 {
3085 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3086 	struct bpf_tcp_iter_state *iter = seq->private;
3087 	struct tcp_iter_state *st = &iter->state;
3088 	struct hlist_nulls_node *node;
3089 	unsigned int expected = 1;
3090 	struct sock *sk;
3091 
3092 	sock_hold(start_sk);
3093 	iter->batch[iter->end_sk++] = start_sk;
3094 
3095 	sk = sk_nulls_next(start_sk);
3096 	sk_nulls_for_each_from(sk, node) {
3097 		if (seq_sk_match(seq, sk)) {
3098 			if (iter->end_sk < iter->max_sk) {
3099 				sock_hold(sk);
3100 				iter->batch[iter->end_sk++] = sk;
3101 			}
3102 			expected++;
3103 		}
3104 	}
3105 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3106 
3107 	return expected;
3108 }
3109 
3110 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3111 {
3112 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3113 	struct bpf_tcp_iter_state *iter = seq->private;
3114 	struct tcp_iter_state *st = &iter->state;
3115 	unsigned int expected;
3116 	bool resized = false;
3117 	struct sock *sk;
3118 
3119 	/* The st->bucket is done.  Directly advance to the next
3120 	 * bucket instead of having tcp_seek_last_pos() skip sockets
3121 	 * one by one in the current bucket, only to find out it has
3122 	 * to advance to the next bucket anyway.
3123 	 */
3124 	if (iter->st_bucket_done) {
3125 		st->offset = 0;
3126 		st->bucket++;
3127 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3128 		    st->bucket > hinfo->lhash2_mask) {
3129 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3130 			st->bucket = 0;
3131 		}
3132 	}
3133 
3134 again:
3135 	/* Get a new batch */
3136 	iter->cur_sk = 0;
3137 	iter->end_sk = 0;
3138 	iter->st_bucket_done = false;
3139 
3140 	sk = tcp_seek_last_pos(seq);
3141 	if (!sk)
3142 		return NULL; /* Done */
3143 
3144 	if (st->state == TCP_SEQ_STATE_LISTENING)
3145 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3146 	else
3147 		expected = bpf_iter_tcp_established_batch(seq, sk);
3148 
3149 	if (iter->end_sk == expected) {
3150 		iter->st_bucket_done = true;
3151 		return sk;
3152 	}
3153 
3154 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3155 		resized = true;
3156 		goto again;
3157 	}
3158 
3159 	return sk;
3160 }
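/*
 * Illustrative note (not part of the kernel tree): when a bucket holds
 * more matching sockets than the current batch array, the array is grown
 * to 3/2 of the observed count and the bucket is walked once more.  E.g.
 * with max_sk = 16 and 20 matching sockets in the bucket, expected (20)
 * exceeds end_sk (16), so the batch is reallocated to 20 * 3 / 2 = 30
 * entries and the "again:" label re-reads the bucket; the resize is
 * attempted at most once per call.
 */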
3161 
3162 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3163 {
3164 	/* bpf iter does not support lseek, so it always
3165 	 * continues from where it was stop()-ped.
3166 	 */
3167 	if (*pos)
3168 		return bpf_iter_tcp_batch(seq);
3169 
3170 	return SEQ_START_TOKEN;
3171 }
3172 
3173 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3174 {
3175 	struct bpf_tcp_iter_state *iter = seq->private;
3176 	struct tcp_iter_state *st = &iter->state;
3177 	struct sock *sk;
3178 
3179 	/* Whenever seq_next() is called, the iter->cur_sk is
3180 	 * done with seq_show(), so advance to the next sk in
3181 	 * the batch.
3182 	 */
3183 	if (iter->cur_sk < iter->end_sk) {
3184 		/* Keeping st->num consistent in tcp_iter_state.
3185 		 * bpf_iter_tcp does not use st->num.
3186 		 * meta.seq_num is used instead.
3187 		 */
3188 		st->num++;
3189 		/* Move st->offset to the next sk in the bucket such that
3190 		 * the future start() will resume at st->offset in
3191 		 * st->bucket.  See tcp_seek_last_pos().
3192 		 */
3193 		st->offset++;
3194 		sock_gen_put(iter->batch[iter->cur_sk++]);
3195 	}
3196 
3197 	if (iter->cur_sk < iter->end_sk)
3198 		sk = iter->batch[iter->cur_sk];
3199 	else
3200 		sk = bpf_iter_tcp_batch(seq);
3201 
3202 	++*pos;
3203 	/* Keeping st->last_pos consistent in tcp_iter_state.
3204 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3205 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3206 	st->last_pos = *pos;
3207 	return sk;
3208 }
3209 
3210 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3211 {
3212 	struct bpf_iter_meta meta;
3213 	struct bpf_prog *prog;
3214 	struct sock *sk = v;
3215 	uid_t uid;
3216 	int ret;
3217 
3218 	if (v == SEQ_START_TOKEN)
3219 		return 0;
3220 
3221 	if (sk_fullsock(sk))
3222 		lock_sock(sk);
3223 
3224 	if (unlikely(sk_unhashed(sk))) {
3225 		ret = SEQ_SKIP;
3226 		goto unlock;
3227 	}
3228 
3229 	if (sk->sk_state == TCP_TIME_WAIT) {
3230 		uid = 0;
3231 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3232 		const struct request_sock *req = v;
3233 
3234 		uid = from_kuid_munged(seq_user_ns(seq),
3235 				       sock_i_uid(req->rsk_listener));
3236 	} else {
3237 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3238 	}
3239 
3240 	meta.seq = seq;
3241 	prog = bpf_iter_get_info(&meta, false);
3242 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3243 
3244 unlock:
3245 	if (sk_fullsock(sk))
3246 		release_sock(sk);
3247 	return ret;
3248 
3249 }
3250 
3251 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3252 {
3253 	struct bpf_tcp_iter_state *iter = seq->private;
3254 	struct bpf_iter_meta meta;
3255 	struct bpf_prog *prog;
3256 
3257 	if (!v) {
3258 		meta.seq = seq;
3259 		prog = bpf_iter_get_info(&meta, true);
3260 		if (prog)
3261 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3262 	}
3263 
3264 	if (iter->cur_sk < iter->end_sk) {
3265 		bpf_iter_tcp_put_batch(iter);
3266 		iter->st_bucket_done = false;
3267 	}
3268 }
3269 
3270 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3271 	.show		= bpf_iter_tcp_seq_show,
3272 	.start		= bpf_iter_tcp_seq_start,
3273 	.next		= bpf_iter_tcp_seq_next,
3274 	.stop		= bpf_iter_tcp_seq_stop,
3275 };
3276 #endif
3277 static unsigned short seq_file_family(const struct seq_file *seq)
3278 {
3279 	const struct tcp_seq_afinfo *afinfo;
3280 
3281 #ifdef CONFIG_BPF_SYSCALL
3282 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3283 	if (seq->op == &bpf_iter_tcp_seq_ops)
3284 		return AF_UNSPEC;
3285 #endif
3286 
3287 	/* Iterated from proc fs */
3288 	afinfo = pde_data(file_inode(seq->file));
3289 	return afinfo->family;
3290 }
3291 
3292 static const struct seq_operations tcp4_seq_ops = {
3293 	.show		= tcp4_seq_show,
3294 	.start		= tcp_seq_start,
3295 	.next		= tcp_seq_next,
3296 	.stop		= tcp_seq_stop,
3297 };
3298 
3299 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3300 	.family		= AF_INET,
3301 };
3302 
3303 static int __net_init tcp4_proc_init_net(struct net *net)
3304 {
3305 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3306 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3307 		return -ENOMEM;
3308 	return 0;
3309 }
3310 
3311 static void __net_exit tcp4_proc_exit_net(struct net *net)
3312 {
3313 	remove_proc_entry("tcp", net->proc_net);
3314 }
3315 
3316 static struct pernet_operations tcp4_net_ops = {
3317 	.init = tcp4_proc_init_net,
3318 	.exit = tcp4_proc_exit_net,
3319 };
3320 
3321 int __init tcp4_proc_init(void)
3322 {
3323 	return register_pernet_subsys(&tcp4_net_ops);
3324 }
3325 
3326 void tcp4_proc_exit(void)
3327 {
3328 	unregister_pernet_subsys(&tcp4_net_ops);
3329 }
3330 #endif /* CONFIG_PROC_FS */
3331 
3332 /* @wake is one when sk_stream_write_space() calls us.
3333  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3334  * This mimics the strategy used in sock_def_write_space().
3335  */
3336 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3337 {
3338 	const struct tcp_sock *tp = tcp_sk(sk);
3339 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3340 			    READ_ONCE(tp->snd_nxt);
3341 
3342 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3343 }
3344 EXPORT_SYMBOL(tcp_stream_memory_free);
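/*
 * Illustrative note (not part of the kernel tree): with wake == 1 the
 * test above becomes (notsent_bytes * 2) < tcp_notsent_lowat(tp), so
 * EPOLLOUT is only signalled once less than half of the notsent limit
 * is in use.  E.g. with TCP_NOTSENT_LOWAT set to 131072 (128 KB):
 *
 *	notsent_bytes = 71680 (70 KB) -> 143360 >= 131072 -> not writable yet
 *	notsent_bytes = 61440 (60 KB) -> 122880 <  131072 -> EPOLLOUT reported
 */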
3345 
3346 struct proto tcp_prot = {
3347 	.name			= "TCP",
3348 	.owner			= THIS_MODULE,
3349 	.close			= tcp_close,
3350 	.pre_connect		= tcp_v4_pre_connect,
3351 	.connect		= tcp_v4_connect,
3352 	.disconnect		= tcp_disconnect,
3353 	.accept			= inet_csk_accept,
3354 	.ioctl			= tcp_ioctl,
3355 	.init			= tcp_v4_init_sock,
3356 	.destroy		= tcp_v4_destroy_sock,
3357 	.shutdown		= tcp_shutdown,
3358 	.setsockopt		= tcp_setsockopt,
3359 	.getsockopt		= tcp_getsockopt,
3360 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3361 	.keepalive		= tcp_set_keepalive,
3362 	.recvmsg		= tcp_recvmsg,
3363 	.sendmsg		= tcp_sendmsg,
3364 	.splice_eof		= tcp_splice_eof,
3365 	.backlog_rcv		= tcp_v4_do_rcv,
3366 	.release_cb		= tcp_release_cb,
3367 	.hash			= inet_hash,
3368 	.unhash			= inet_unhash,
3369 	.get_port		= inet_csk_get_port,
3370 	.put_port		= inet_put_port,
3371 #ifdef CONFIG_BPF_SYSCALL
3372 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3373 #endif
3374 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3375 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3376 	.stream_memory_free	= tcp_stream_memory_free,
3377 	.sockets_allocated	= &tcp_sockets_allocated,
3378 	.orphan_count		= &tcp_orphan_count,
3379 
3380 	.memory_allocated	= &tcp_memory_allocated,
3381 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3382 
3383 	.memory_pressure	= &tcp_memory_pressure,
3384 	.sysctl_mem		= sysctl_tcp_mem,
3385 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3386 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3387 	.max_header		= MAX_TCP_HEADER,
3388 	.obj_size		= sizeof(struct tcp_sock),
3389 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3390 	.twsk_prot		= &tcp_timewait_sock_ops,
3391 	.rsk_prot		= &tcp_request_sock_ops,
3392 	.h.hashinfo		= NULL,
3393 	.no_autobind		= true,
3394 	.diag_destroy		= tcp_abort,
3395 };
3396 EXPORT_SYMBOL(tcp_prot);
3397 
3398 static void __net_exit tcp_sk_exit(struct net *net)
3399 {
3400 	if (net->ipv4.tcp_congestion_control)
3401 		bpf_module_put(net->ipv4.tcp_congestion_control,
3402 			       net->ipv4.tcp_congestion_control->owner);
3403 }
3404 
3405 static void __net_init tcp_set_hashinfo(struct net *net)
3406 {
3407 	struct inet_hashinfo *hinfo;
3408 	unsigned int ehash_entries;
3409 	struct net *old_net;
3410 
3411 	if (net_eq(net, &init_net))
3412 		goto fallback;
3413 
3414 	old_net = current->nsproxy->net_ns;
3415 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3416 	if (!ehash_entries)
3417 		goto fallback;
3418 
3419 	ehash_entries = roundup_pow_of_two(ehash_entries);
3420 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3421 	if (!hinfo) {
3422 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3423 			"for a netns, fallback to the global one\n",
3424 			ehash_entries);
3425 fallback:
3426 		hinfo = &tcp_hashinfo;
3427 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3428 	}
3429 
3430 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3431 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3432 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3433 }
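/*
 * Illustrative note (not part of the kernel tree): for a child netns
 * created while net.ipv4.tcp_child_ehash_entries = 1000 in the creating
 * netns, the value is rounded up to 1024 ehash entries, which then gives
 * tcp_death_row.sysctl_max_tw_buckets = 1024 / 2 = 512 and
 * sysctl_max_syn_backlog = max(128, 1024 / 128) = 128.  With the sysctl
 * left at its default of 0, the child netns falls back to the global
 * tcp_hashinfo.
 */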
3434 
3435 static int __net_init tcp_sk_init(struct net *net)
3436 {
3437 	net->ipv4.sysctl_tcp_ecn = 2;
3438 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3439 
3440 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3441 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3442 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3443 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3444 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3445 
3446 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3447 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3448 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3449 
3450 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3451 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3452 	net->ipv4.sysctl_tcp_syncookies = 1;
3453 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3454 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3455 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3456 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3457 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3458 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3459 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3460 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3461 
3462 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3463 	tcp_set_hashinfo(net);
3464 
3465 	net->ipv4.sysctl_tcp_sack = 1;
3466 	net->ipv4.sysctl_tcp_window_scaling = 1;
3467 	net->ipv4.sysctl_tcp_timestamps = 1;
3468 	net->ipv4.sysctl_tcp_early_retrans = 3;
3469 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3470 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3471 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3472 	net->ipv4.sysctl_tcp_max_reordering = 300;
3473 	net->ipv4.sysctl_tcp_dsack = 1;
3474 	net->ipv4.sysctl_tcp_app_win = 31;
3475 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3476 	net->ipv4.sysctl_tcp_frto = 2;
3477 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3478 	/* This limits the percentage of the congestion window which we
3479 	 * will allow a single TSO frame to consume.  Building TSO frames
3480 	 * which are too large can cause TCP streams to be bursty.
3481 	 */
3482 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3483 	/* Default TSQ limit of 16 TSO segments */
3484 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3485 
3486 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3487 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3488 
3489 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3490 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3491 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3492 	net->ipv4.sysctl_tcp_autocorking = 1;
3493 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3494 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3495 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3496 	if (net != &init_net) {
3497 		memcpy(net->ipv4.sysctl_tcp_rmem,
3498 		       init_net.ipv4.sysctl_tcp_rmem,
3499 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3500 		memcpy(net->ipv4.sysctl_tcp_wmem,
3501 		       init_net.ipv4.sysctl_tcp_wmem,
3502 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3503 	}
3504 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3505 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3506 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3507 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3508 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3509 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3510 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3511 
3512 	/* Set default values for PLB */
3513 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3514 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3515 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3516 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3517 	/* Default congestion threshold for PLB to mark a round is 50% */
3518 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3519 
3520 	/* Reno is always built in */
3521 	if (!net_eq(net, &init_net) &&
3522 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3523 			       init_net.ipv4.tcp_congestion_control->owner))
3524 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3525 	else
3526 		net->ipv4.tcp_congestion_control = &tcp_reno;
3527 
3528 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3529 	net->ipv4.sysctl_tcp_shrink_window = 0;
3530 
3531 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3532 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3533 
3534 	return 0;
3535 }
3536 
3537 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3538 {
3539 	struct net *net;
3540 
3541 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3542 	 * and from the failed setup_net() error-unwinding path are serialized.
3543 	 *
3544 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3545 	 * net_exit_list, so the thread that dismantles a particular twsk must
3546 	 * do so without another thread progressing to refcount_dec_and_test() of
3547 	 * tcp_death_row.tw_refcount.
3548 	 */
3549 	mutex_lock(&tcp_exit_batch_mutex);
3550 
3551 	tcp_twsk_purge(net_exit_list);
3552 
3553 	list_for_each_entry(net, net_exit_list, exit_list) {
3554 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3555 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3556 		tcp_fastopen_ctx_destroy(net);
3557 	}
3558 
3559 	mutex_unlock(&tcp_exit_batch_mutex);
3560 }
3561 
3562 static struct pernet_operations __net_initdata tcp_sk_ops = {
3563 	.init	   = tcp_sk_init,
3564 	.exit	   = tcp_sk_exit,
3565 	.exit_batch = tcp_sk_exit_batch,
3566 };
3567 
3568 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3569 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3570 		     struct sock_common *sk_common, uid_t uid)
3571 
3572 #define INIT_BATCH_SZ 16
3573 
3574 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3575 {
3576 	struct bpf_tcp_iter_state *iter = priv_data;
3577 	int err;
3578 
3579 	err = bpf_iter_init_seq_net(priv_data, aux);
3580 	if (err)
3581 		return err;
3582 
3583 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3584 	if (err) {
3585 		bpf_iter_fini_seq_net(priv_data);
3586 		return err;
3587 	}
3588 
3589 	return 0;
3590 }
3591 
3592 static void bpf_iter_fini_tcp(void *priv_data)
3593 {
3594 	struct bpf_tcp_iter_state *iter = priv_data;
3595 
3596 	bpf_iter_fini_seq_net(priv_data);
3597 	kvfree(iter->batch);
3598 }
3599 
3600 static const struct bpf_iter_seq_info tcp_seq_info = {
3601 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3602 	.init_seq_private	= bpf_iter_init_tcp,
3603 	.fini_seq_private	= bpf_iter_fini_tcp,
3604 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3605 };
3606 
3607 static const struct bpf_func_proto *
3608 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3609 			    const struct bpf_prog *prog)
3610 {
3611 	switch (func_id) {
3612 	case BPF_FUNC_setsockopt:
3613 		return &bpf_sk_setsockopt_proto;
3614 	case BPF_FUNC_getsockopt:
3615 		return &bpf_sk_getsockopt_proto;
3616 	default:
3617 		return NULL;
3618 	}
3619 }
3620 
3621 static struct bpf_iter_reg tcp_reg_info = {
3622 	.target			= "tcp",
3623 	.ctx_arg_info_size	= 1,
3624 	.ctx_arg_info		= {
3625 		{ offsetof(struct bpf_iter__tcp, sk_common),
3626 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3627 	},
3628 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3629 	.seq_info		= &tcp_seq_info,
3630 };
3631 
3632 static void __init bpf_iter_register(void)
3633 {
3634 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3635 	if (bpf_iter_reg_target(&tcp_reg_info))
3636 		pr_warn("could not register bpf iterator tcp\n");
3637 }
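
/* Minimal sketch, assuming the usual libbpf conventions, of a BPF program
 * that could attach to the "tcp" iterator target registered above; the
 * context layout follows DEFINE_BPF_ITER_FUNC(tcp, ...):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */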
3638 
3639 #endif
3640 
3641 void __init tcp_v4_init(void)
3642 {
3643 	int cpu, res;
3644 
3645 	for_each_possible_cpu(cpu) {
3646 		struct sock *sk;
3647 
3648 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3649 					   IPPROTO_TCP, &init_net);
3650 		if (res)
3651 			panic("Failed to create the TCP control socket.\n");
3652 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3653 
3654 		/* Enforce IP_DF and IPID==0 for the RST and ACK packets
3655 		 * sent from SYN-RECV and TIME-WAIT state.
3656 		 */
3657 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3658 
3659 		sk->sk_clockid = CLOCK_MONOTONIC;
3660 
3661 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3662 	}
3663 	if (register_pernet_subsys(&tcp_sk_ops))
3664 		panic("Failed to register the TCP pernet subsystem.\n");
3665 
3666 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3667 	bpf_iter_register();
3668 #endif
3669 }
3670