/* xref: /linux/net/ipv4/tcp_ipv4.c (revision 9410645520e9b820069761f3450ef6661418e279) */
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 #include <linux/skbuff_ref.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 #include <trace/events/tcp.h>
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
99 };
100 
101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
102 
103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104 {
105 	return secure_tcp_seq(ip_hdr(skb)->daddr,
106 			      ip_hdr(skb)->saddr,
107 			      tcp_hdr(skb)->dest,
108 			      tcp_hdr(skb)->source);
109 }
110 
111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112 {
113 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114 }
115 
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
120 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 	struct tcp_sock *tp = tcp_sk(sk);
122 	int ts_recent_stamp;
123 
124 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
125 		reuse = 0;
126 
127 	if (reuse == 2) {
128 		/* Still does not detect *everything* that goes through
129 		 * lo, since we require a loopback src or dst address
130 		 * or direct binding to 'lo' interface.
131 		 */
132 		bool loopback = false;
133 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
134 			loopback = true;
135 #if IS_ENABLED(CONFIG_IPV6)
136 		if (tw->tw_family == AF_INET6) {
137 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
138 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
139 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
140 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
141 				loopback = true;
142 		} else
143 #endif
144 		{
145 			if (ipv4_is_loopback(tw->tw_daddr) ||
146 			    ipv4_is_loopback(tw->tw_rcv_saddr))
147 				loopback = true;
148 		}
149 		if (!loopback)
150 			reuse = 0;
151 	}
152 
153 	/* With PAWS, it is safe from the viewpoint
154 	   of data integrity. Even without PAWS it is safe provided sequence
155 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
156 
157 	   Actually, the idea is close to VJ's: only the timestamp cache is
158 	   held not per host but per port pair, and the TW bucket is used as
159 	   the state holder.
160 
161 	   If the TW bucket has already been destroyed, we fall back to VJ's
162 	   scheme and use the initial timestamp retrieved from the peer table.
163 	 */
164 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
165 	if (ts_recent_stamp &&
166 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
167 					    ts_recent_stamp)))) {
168 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
169 		 * and releasing the bucket lock.
170 		 */
171 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
172 			return 0;
173 
174 		/* In case of repair and re-using TIME-WAIT sockets we still
175 		 * want to be sure that it is safe as above but honor the
176 		 * sequence numbers and time stamps set as part of the repair
177 		 * process.
178 		 *
179 		 * Without this check re-using a TIME-WAIT socket with TCP
180 		 * repair would accumulate a -1 on the repair assigned
181 		 * sequence number. The first time it is reused the sequence
182 		 * is -1, the second time -2, etc. This fixes that issue
183 		 * without appearing to create any others.
184 		 */
185 		if (likely(!tp->repair)) {
186 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
187 
188 			if (!seq)
189 				seq = 1;
190 			WRITE_ONCE(tp->write_seq, seq);
191 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
192 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
193 		}
194 
195 		return 1;
196 	}
197 
198 	return 0;
199 }
200 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
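
/*
 * Worked example (illustrative only, not normative): with
 * net.ipv4.tcp_tw_reuse=1 a TIME-WAIT port pair may be reused for an
 * outgoing connection once the old connection's timestamp state is at
 * least a second old, and tcp_tw_reuse=2 restricts that to loopback
 * traffic (the check above).  When reuse happens and the socket is not
 * being repaired, the new write_seq is placed safely beyond the old
 * send space:
 *
 *	if tcptw->tw_snd_nxt == 1000, then
 *	write_seq = 1000 + 65535 + 2 = 66537
 *
 * i.e. one maximum (unscaled) window plus two, so segments from the
 * old and the new incarnation cannot be confused even without PAWS.
 */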
201 
202 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
203 			      int addr_len)
204 {
205 	/* This check is replicated from tcp_v4_connect() and intended to
206 	 * prevent BPF program called below from accessing bytes that are out
207 	 * of the bound specified by user in addr_len.
208 	 */
209 	if (addr_len < sizeof(struct sockaddr_in))
210 		return -EINVAL;
211 
212 	sock_owned_by_me(sk);
213 
214 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
215 }
216 
217 /* This will initiate an outgoing connection. */
218 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
219 {
220 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
221 	struct inet_timewait_death_row *tcp_death_row;
222 	struct inet_sock *inet = inet_sk(sk);
223 	struct tcp_sock *tp = tcp_sk(sk);
224 	struct ip_options_rcu *inet_opt;
225 	struct net *net = sock_net(sk);
226 	__be16 orig_sport, orig_dport;
227 	__be32 daddr, nexthop;
228 	struct flowi4 *fl4;
229 	struct rtable *rt;
230 	int err;
231 
232 	if (addr_len < sizeof(struct sockaddr_in))
233 		return -EINVAL;
234 
235 	if (usin->sin_family != AF_INET)
236 		return -EAFNOSUPPORT;
237 
238 	nexthop = daddr = usin->sin_addr.s_addr;
239 	inet_opt = rcu_dereference_protected(inet->inet_opt,
240 					     lockdep_sock_is_held(sk));
241 	if (inet_opt && inet_opt->opt.srr) {
242 		if (!daddr)
243 			return -EINVAL;
244 		nexthop = inet_opt->opt.faddr;
245 	}
246 
247 	orig_sport = inet->inet_sport;
248 	orig_dport = usin->sin_port;
249 	fl4 = &inet->cork.fl.u.ip4;
250 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
251 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
252 			      orig_dport, sk);
253 	if (IS_ERR(rt)) {
254 		err = PTR_ERR(rt);
255 		if (err == -ENETUNREACH)
256 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
257 		return err;
258 	}
259 
260 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
261 		ip_rt_put(rt);
262 		return -ENETUNREACH;
263 	}
264 
265 	if (!inet_opt || !inet_opt->opt.srr)
266 		daddr = fl4->daddr;
267 
268 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
269 
270 	if (!inet->inet_saddr) {
271 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
272 		if (err) {
273 			ip_rt_put(rt);
274 			return err;
275 		}
276 	} else {
277 		sk_rcv_saddr_set(sk, inet->inet_saddr);
278 	}
279 
280 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
281 		/* Reset inherited state */
282 		tp->rx_opt.ts_recent	   = 0;
283 		tp->rx_opt.ts_recent_stamp = 0;
284 		if (likely(!tp->repair))
285 			WRITE_ONCE(tp->write_seq, 0);
286 	}
287 
288 	inet->inet_dport = usin->sin_port;
289 	sk_daddr_set(sk, daddr);
290 
291 	inet_csk(sk)->icsk_ext_hdr_len = 0;
292 	if (inet_opt)
293 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
294 
295 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
296 
297 	/* Socket identity is still unknown (sport may be zero).
298 	 * However, we set the state to SYN-SENT and, without releasing the
299 	 * socket lock, select a source port, enter ourselves into the hash
300 	 * tables and complete initialization after this.
301 	 */
302 	tcp_set_state(sk, TCP_SYN_SENT);
303 	err = inet_hash_connect(tcp_death_row, sk);
304 	if (err)
305 		goto failure;
306 
307 	sk_set_txhash(sk);
308 
309 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
310 			       inet->inet_sport, inet->inet_dport, sk);
311 	if (IS_ERR(rt)) {
312 		err = PTR_ERR(rt);
313 		rt = NULL;
314 		goto failure;
315 	}
316 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
317 	/* OK, now commit destination to socket.  */
318 	sk->sk_gso_type = SKB_GSO_TCPV4;
319 	sk_setup_caps(sk, &rt->dst);
320 	rt = NULL;
321 
322 	if (likely(!tp->repair)) {
323 		if (!tp->write_seq)
324 			WRITE_ONCE(tp->write_seq,
325 				   secure_tcp_seq(inet->inet_saddr,
326 						  inet->inet_daddr,
327 						  inet->inet_sport,
328 						  usin->sin_port));
329 		WRITE_ONCE(tp->tsoffset,
330 			   secure_tcp_ts_off(net, inet->inet_saddr,
331 					     inet->inet_daddr));
332 	}
333 
334 	atomic_set(&inet->inet_id, get_random_u16());
335 
336 	if (tcp_fastopen_defer_connect(sk, &err))
337 		return err;
338 	if (err)
339 		goto failure;
340 
341 	err = tcp_connect(sk);
342 
343 	if (err)
344 		goto failure;
345 
346 	return 0;
347 
348 failure:
349 	/*
350 	 * This unhashes the socket and releases the local port,
351 	 * if necessary.
352 	 */
353 	tcp_set_state(sk, TCP_CLOSE);
354 	inet_bhash2_reset_saddr(sk);
355 	ip_rt_put(rt);
356 	sk->sk_route_caps = 0;
357 	inet->inet_dport = 0;
358 	return err;
359 }
360 EXPORT_SYMBOL(tcp_v4_connect);
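
/*
 * For reference, tcp_v4_connect() is reached from user space through
 * connect() on an AF_INET stream socket (the connect() system call ->
 * inet_stream_connect() -> sk->sk_prot->connect()).  Minimal
 * user-space sketch (error handling omitted; 192.0.2.1:80 is a
 * placeholder destination from the documentation range):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */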
361 
362 /*
363  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
364  * It can be called through tcp_release_cb() if socket was owned by user
365  * at the time tcp_v4_err() was called to handle ICMP message.
366  */
367 void tcp_v4_mtu_reduced(struct sock *sk)
368 {
369 	struct inet_sock *inet = inet_sk(sk);
370 	struct dst_entry *dst;
371 	u32 mtu;
372 
373 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
374 		return;
375 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
376 	dst = inet_csk_update_pmtu(sk, mtu);
377 	if (!dst)
378 		return;
379 
380 	/* Something is about to go wrong... Remember the soft error
381 	 * in case this connection is not able to recover.
382 	 */
383 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
384 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
385 
386 	mtu = dst_mtu(dst);
387 
388 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
389 	    ip_sk_accept_pmtu(sk) &&
390 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
391 		tcp_sync_mss(sk, mtu);
392 
393 		/* Resend the TCP packet because it's
394 		 * clear that the old packet has been
395 		 * dropped. This is the new "fast" path mtu
396 		 * discovery.
397 		 */
398 		tcp_simple_retransmit(sk);
399 	} /* else let the usual retransmit timer handle it */
400 }
401 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
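
/*
 * Illustrative numbers (an assumed scenario, not taken from the code
 * above): suppose icsk_pmtu_cookie is 1500 and an ICMP_FRAG_NEEDED
 * arrives advertising an MTU of 1400.  inet_csk_update_pmtu() lowers
 * the cached route MTU, tcp_sync_mss(sk, 1400) shrinks the MSS to
 * roughly 1400 minus 40 bytes of IPv4+TCP headers (less any TCP
 * options such as timestamps), and tcp_simple_retransmit() resends the
 * segments that were too large, without waiting for the retransmit
 * timer.
 */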
402 
403 static void do_redirect(struct sk_buff *skb, struct sock *sk)
404 {
405 	struct dst_entry *dst = __sk_dst_check(sk, 0);
406 
407 	if (dst)
408 		dst->ops->redirect(dst, sk, skb);
409 }
410 
411 
412 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
413 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
414 {
415 	struct request_sock *req = inet_reqsk(sk);
416 	struct net *net = sock_net(sk);
417 
418 	/* ICMPs are not backlogged, hence we cannot get
419 	 * an established socket here.
420 	 */
421 	if (seq != tcp_rsk(req)->snt_isn) {
422 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
423 	} else if (abort) {
424 		/*
425 		 * Still in SYN_RECV, just remove it silently.
426 		 * There is no good way to pass the error to the newly
427 		 * created socket, and POSIX does not want network
428 		 * errors returned from accept().
429 		 */
430 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
431 		tcp_listendrop(req->rsk_listener);
432 	}
433 	reqsk_put(req);
434 }
435 EXPORT_SYMBOL(tcp_req_err);
436 
437 /* TCP-LD (RFC 6069) logic */
438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439 {
440 	struct inet_connection_sock *icsk = inet_csk(sk);
441 	struct tcp_sock *tp = tcp_sk(sk);
442 	struct sk_buff *skb;
443 	s32 remaining;
444 	u32 delta_us;
445 
446 	if (sock_owned_by_user(sk))
447 		return;
448 
449 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
450 	    !icsk->icsk_backoff)
451 		return;
452 
453 	skb = tcp_rtx_queue_head(sk);
454 	if (WARN_ON_ONCE(!skb))
455 		return;
456 
457 	icsk->icsk_backoff--;
458 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
460 
461 	tcp_mstamp_refresh(tp);
462 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464 
465 	if (remaining > 0) {
466 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
467 					  remaining, TCP_RTO_MAX);
468 	} else {
469 		/* RTO revert clocked out retransmission.
470 		 * Will retransmit now.
471 		 */
472 		tcp_retransmit_timer(sk);
473 	}
474 }
475 EXPORT_SYMBOL(tcp_ld_RTO_revert);
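
/*
 * Worked example for the RFC 6069 revert above (assumed numbers): with
 * a base RTO of 200 ms and icsk_backoff == 3, the timer was last armed
 * for 200 << 3 = 1600 ms.  An ICMP "host unreachable" for the un-acked
 * head lets us undo one backoff step, so the new timeout is
 * 200 << 2 = 800 ms.  If 500 ms have already elapsed since the head
 * skb was (re)sent, the timer is re-armed for the remaining 300 ms; if
 * 800 ms or more have elapsed, we retransmit immediately.
 */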
476 
477 /*
478  * This routine is called by the ICMP module when it gets some
479  * sort of error condition.  If err < 0 then the socket should
480  * be closed and the error returned to the user.  If err > 0
481  * it's just the icmp type << 8 | icmp code.  After adjustment, the
482  * header points to the first 8 bytes of the TCP header.  We need
483  * to find the appropriate port.
484  *
485  * The locking strategy used here is very "optimistic". When
486  * someone else accesses the socket the ICMP is just dropped
487  * and for some paths there is no check at all.
488  * A more general error queue to queue errors for later handling
489  * is probably better.
490  *
491  */
492 
493 int tcp_v4_err(struct sk_buff *skb, u32 info)
494 {
495 	const struct iphdr *iph = (const struct iphdr *)skb->data;
496 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
497 	struct tcp_sock *tp;
498 	const int type = icmp_hdr(skb)->type;
499 	const int code = icmp_hdr(skb)->code;
500 	struct sock *sk;
501 	struct request_sock *fastopen;
502 	u32 seq, snd_una;
503 	int err;
504 	struct net *net = dev_net(skb->dev);
505 
506 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
507 				       iph->daddr, th->dest, iph->saddr,
508 				       ntohs(th->source), inet_iif(skb), 0);
509 	if (!sk) {
510 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
511 		return -ENOENT;
512 	}
513 	if (sk->sk_state == TCP_TIME_WAIT) {
514 		/* To increase the counter of ignored icmps for TCP-AO */
515 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
516 		inet_twsk_put(inet_twsk(sk));
517 		return 0;
518 	}
519 	seq = ntohl(th->seq);
520 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
521 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
522 				     type == ICMP_TIME_EXCEEDED ||
523 				     (type == ICMP_DEST_UNREACH &&
524 				      (code == ICMP_NET_UNREACH ||
525 				       code == ICMP_HOST_UNREACH)));
526 		return 0;
527 	}
528 
529 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
530 		sock_put(sk);
531 		return 0;
532 	}
533 
534 	bh_lock_sock(sk);
535 	/* If too many ICMPs get dropped on busy
536 	 * servers this needs to be solved differently.
537 	 * We do take care of PMTU discovery (RFC1191) special case :
538 	 * we can receive locally generated ICMP messages while socket is held.
539 	 */
540 	if (sock_owned_by_user(sk)) {
541 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
542 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
543 	}
544 	if (sk->sk_state == TCP_CLOSE)
545 		goto out;
546 
547 	if (static_branch_unlikely(&ip4_min_ttl)) {
548 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
549 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
550 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
551 			goto out;
552 		}
553 	}
554 
555 	tp = tcp_sk(sk);
556 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
557 	fastopen = rcu_dereference(tp->fastopen_rsk);
558 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
559 	if (sk->sk_state != TCP_LISTEN &&
560 	    !between(seq, snd_una, tp->snd_nxt)) {
561 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
562 		goto out;
563 	}
564 
565 	switch (type) {
566 	case ICMP_REDIRECT:
567 		if (!sock_owned_by_user(sk))
568 			do_redirect(skb, sk);
569 		goto out;
570 	case ICMP_SOURCE_QUENCH:
571 		/* Just silently ignore these. */
572 		goto out;
573 	case ICMP_PARAMETERPROB:
574 		err = EPROTO;
575 		break;
576 	case ICMP_DEST_UNREACH:
577 		if (code > NR_ICMP_UNREACH)
578 			goto out;
579 
580 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
581 			/* We are not interested in TCP_LISTEN and open_requests
582 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
583 			 * they should go through unfragmented).
584 			 */
585 			if (sk->sk_state == TCP_LISTEN)
586 				goto out;
587 
588 			WRITE_ONCE(tp->mtu_info, info);
589 			if (!sock_owned_by_user(sk)) {
590 				tcp_v4_mtu_reduced(sk);
591 			} else {
592 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
593 					sock_hold(sk);
594 			}
595 			goto out;
596 		}
597 
598 		err = icmp_err_convert[code].errno;
599 		/* check if this ICMP message allows revert of backoff.
600 		 * (see RFC 6069)
601 		 */
602 		if (!fastopen &&
603 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
604 			tcp_ld_RTO_revert(sk, seq);
605 		break;
606 	case ICMP_TIME_EXCEEDED:
607 		err = EHOSTUNREACH;
608 		break;
609 	default:
610 		goto out;
611 	}
612 
613 	switch (sk->sk_state) {
614 	case TCP_SYN_SENT:
615 	case TCP_SYN_RECV:
616 		/* Only in fast or simultaneous open. If a fast open socket is
617 		 * already accepted it is treated as a connected one below.
618 		 */
619 		if (fastopen && !fastopen->sk)
620 			break;
621 
622 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
623 
624 		if (!sock_owned_by_user(sk))
625 			tcp_done_with_error(sk, err);
626 		else
627 			WRITE_ONCE(sk->sk_err_soft, err);
628 		goto out;
629 	}
630 
631 	/* If we've already connected we will keep trying
632 	 * until we time out, or the user gives up.
633 	 *
634 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
635 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
636 	 * but it is obsoleted by PMTU discovery).
637 	 *
638 	 * Note that in the modern internet, where routing is unreliable
639 	 * and broken firewalls sit in every dark corner, sending random
640 	 * errors ordered by their masters, even these two messages finally lose
641 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
642 	 *
643 	 * Now we are in compliance with RFCs.
644 	 *							--ANK (980905)
645 	 */
646 
647 	if (!sock_owned_by_user(sk) &&
648 	    inet_test_bit(RECVERR, sk)) {
649 		WRITE_ONCE(sk->sk_err, err);
650 		sk_error_report(sk);
651 	} else	{ /* Only an error on timeout */
652 		WRITE_ONCE(sk->sk_err_soft, err);
653 	}
654 
655 out:
656 	bh_unlock_sock(sk);
657 	sock_put(sk);
658 	return 0;
659 }
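
/*
 * From the application's point of view (illustrative, assumed
 * scenario): if a connect() is in progress (TCP_SYN_SENT) and an ICMP
 * "host unreachable" arrives for the SYN, tcp_done_with_error() above
 * closes the socket with sk_err = EHOSTUNREACH, so a blocking
 * connect() fails with EHOSTUNREACH and a non-blocking connect()
 * reports it via getsockopt(fd, SOL_SOCKET, SO_ERROR, ...) or an
 * EPOLLERR/POLLERR event.  For an established connection the same ICMP
 * only sets sk_err_soft, unless IP_RECVERR was enabled and the socket
 * is not owned by the user at that moment.
 */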
660 
661 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
662 {
663 	struct tcphdr *th = tcp_hdr(skb);
664 
665 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
666 	skb->csum_start = skb_transport_header(skb) - skb->head;
667 	skb->csum_offset = offsetof(struct tcphdr, check);
668 }
669 
670 /* This routine computes an IPv4 TCP checksum. */
671 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
672 {
673 	const struct inet_sock *inet = inet_sk(sk);
674 
675 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
676 }
677 EXPORT_SYMBOL(tcp_v4_send_check);
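
/*
 * Note on the partial checksum above (descriptive, not a contract):
 * th->check is seeded with the complemented pseudo-header sum only
 * (source/destination address, protocol 6 and TCP length), while
 * csum_start/csum_offset tell the lower layers where to fold in the
 * rest.  A device with hardware checksum offload finishes the sum in
 * hardware; otherwise the stack falls back to software checksumming
 * (skb_checksum_help()) before handing the skb to the driver.
 */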
678 
679 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
680 
681 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
682 				 const struct tcp_ao_hdr *aoh,
683 				 struct ip_reply_arg *arg, struct tcphdr *reply,
684 				 __be32 reply_options[REPLY_OPTIONS_LEN])
685 {
686 #ifdef CONFIG_TCP_AO
687 	int sdif = tcp_v4_sdif(skb);
688 	int dif = inet_iif(skb);
689 	int l3index = sdif ? dif : 0;
690 	bool allocated_traffic_key;
691 	struct tcp_ao_key *key;
692 	char *traffic_key;
693 	bool drop = true;
694 	u32 ao_sne = 0;
695 	u8 keyid;
696 
697 	rcu_read_lock();
698 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
699 				 &key, &traffic_key, &allocated_traffic_key,
700 				 &keyid, &ao_sne))
701 		goto out;
702 
703 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
704 				 (aoh->rnext_keyid << 8) | keyid);
705 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
706 	reply->doff = arg->iov[0].iov_len / 4;
707 
708 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
709 			    key, traffic_key,
710 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
711 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
712 			    reply, ao_sne))
713 		goto out;
714 	drop = false;
715 out:
716 	rcu_read_unlock();
717 	if (allocated_traffic_key)
718 		kfree(traffic_key);
719 	return drop;
720 #else
721 	return true;
722 #endif
723 }
724 
725 /*
726  *	This routine will send an RST to the other tcp.
727  *
728  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
729  *		      for the reset?
730  *	Answer: if a packet caused an RST, it is not for a socket
731  *		existing in our system; if it is matched to a socket,
732  *		it is just a duplicate segment or a bug in the other side's TCP.
733  *		So we build the reply based only on the parameters
734  *		that arrived with the segment.
735  *	Exception: precedence violation. We do not implement it in any case.
736  */
737 
738 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
739 			      enum sk_rst_reason reason)
740 {
741 	const struct tcphdr *th = tcp_hdr(skb);
742 	struct {
743 		struct tcphdr th;
744 		__be32 opt[REPLY_OPTIONS_LEN];
745 	} rep;
746 	const __u8 *md5_hash_location = NULL;
747 	const struct tcp_ao_hdr *aoh;
748 	struct ip_reply_arg arg;
749 #ifdef CONFIG_TCP_MD5SIG
750 	struct tcp_md5sig_key *key = NULL;
751 	unsigned char newhash[16];
752 	struct sock *sk1 = NULL;
753 	int genhash;
754 #endif
755 	u64 transmit_time = 0;
756 	struct sock *ctl_sk;
757 	struct net *net;
758 	u32 txhash = 0;
759 
760 	/* Never send a reset in response to a reset. */
761 	if (th->rst)
762 		return;
763 
764 	/* If sk is not NULL, it means we did a successful lookup and the
765 	 * incoming route had to be correct. prequeue might have dropped our dst.
766 	 */
767 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
768 		return;
769 
770 	/* Swap the send and the receive. */
771 	memset(&rep, 0, sizeof(rep));
772 	rep.th.dest   = th->source;
773 	rep.th.source = th->dest;
774 	rep.th.doff   = sizeof(struct tcphdr) / 4;
775 	rep.th.rst    = 1;
776 
777 	if (th->ack) {
778 		rep.th.seq = th->ack_seq;
779 	} else {
780 		rep.th.ack = 1;
781 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
782 				       skb->len - (th->doff << 2));
783 	}
784 
785 	memset(&arg, 0, sizeof(arg));
786 	arg.iov[0].iov_base = (unsigned char *)&rep;
787 	arg.iov[0].iov_len  = sizeof(rep.th);
788 
789 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
790 
791 	/* Invalid TCP option size or twice included auth */
792 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
793 		return;
794 
795 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
796 		return;
797 
798 #ifdef CONFIG_TCP_MD5SIG
799 	rcu_read_lock();
800 	if (sk && sk_fullsock(sk)) {
801 		const union tcp_md5_addr *addr;
802 		int l3index;
803 
804 		/* sdif set, means packet ingressed via a device
805 		 * in an L3 domain and inet_iif is set to it.
806 		 */
807 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
808 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
809 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
810 	} else if (md5_hash_location) {
811 		const union tcp_md5_addr *addr;
812 		int sdif = tcp_v4_sdif(skb);
813 		int dif = inet_iif(skb);
814 		int l3index;
815 
816 		/*
817 		 * The active side is lost. Try to find the listening socket through
818 		 * the source port, and then find the md5 key through the listening socket.
819 		 * We do not loosen security here:
820 		 * the incoming packet is checked with the md5 hash of the found key;
821 		 * no RST is generated if the md5 hash doesn't match.
822 		 */
823 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
824 					     NULL, 0, ip_hdr(skb)->saddr,
825 					     th->source, ip_hdr(skb)->daddr,
826 					     ntohs(th->source), dif, sdif);
827 		/* don't send rst if it can't find key */
828 		if (!sk1)
829 			goto out;
830 
831 		/* sdif set, means packet ingressed via a device
832 		 * in an L3 domain and dif is set to it.
833 		 */
834 		l3index = sdif ? dif : 0;
835 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
836 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
837 		if (!key)
838 			goto out;
839 
840 
841 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
842 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
843 			goto out;
844 
845 	}
846 
847 	if (key) {
848 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
849 				   (TCPOPT_NOP << 16) |
850 				   (TCPOPT_MD5SIG << 8) |
851 				   TCPOLEN_MD5SIG);
852 		/* Update length and the length the header thinks exists */
853 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
854 		rep.th.doff = arg.iov[0].iov_len / 4;
855 
856 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
857 				     key, ip_hdr(skb)->saddr,
858 				     ip_hdr(skb)->daddr, &rep.th);
859 	}
860 #endif
861 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
862 	if (rep.opt[0] == 0) {
863 		__be32 mrst = mptcp_reset_option(skb);
864 
865 		if (mrst) {
866 			rep.opt[0] = mrst;
867 			arg.iov[0].iov_len += sizeof(mrst);
868 			rep.th.doff = arg.iov[0].iov_len / 4;
869 		}
870 	}
871 
872 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
873 				      ip_hdr(skb)->saddr, /* XXX */
874 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
875 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
876 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
877 
878 	/* When the socket is gone, all binding information is lost.
879 	 * Routing might fail in this case. No choice here: if we choose to force
880 	 * the input interface, we will misroute in case of an asymmetric route.
881 	 */
882 	if (sk)
883 		arg.bound_dev_if = sk->sk_bound_dev_if;
884 
885 	trace_tcp_send_reset(sk, skb, reason);
886 
887 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
888 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
889 
890 	arg.tos = ip_hdr(skb)->tos;
891 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
894 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
895 
896 	sock_net_set(ctl_sk, net);
897 	if (sk) {
898 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
899 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
900 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
901 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
902 		transmit_time = tcp_transmit_time(sk);
903 		xfrm_sk_clone_policy(ctl_sk, sk);
904 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
905 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
906 	} else {
907 		ctl_sk->sk_mark = 0;
908 		ctl_sk->sk_priority = 0;
909 	}
910 	ip_send_unicast_reply(ctl_sk,
911 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
912 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913 			      &arg, arg.iov[0].iov_len,
914 			      transmit_time, txhash);
915 
916 	xfrm_sk_free_policy(ctl_sk);
917 	sock_net_set(ctl_sk, &init_net);
918 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
919 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
920 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
921 	local_bh_enable();
922 
923 #ifdef CONFIG_TCP_MD5SIG
924 out:
925 	rcu_read_unlock();
926 #endif
927 }
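
/*
 * Example of the ack_seq arithmetic above (assumed segments): for an
 * incoming SYN with seq = S carrying no data, skb->len equals the
 * header length, so ack_seq = S + 1 (the SYN counts as one byte of
 * sequence space).  For a stray data segment with seq = S, 100 payload
 * bytes and no SYN/FIN, ack_seq = S + 100.  This mirrors what a real
 * receiver would have acknowledged, which the peer requires before it
 * accepts the RST.
 */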
928 
929 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
930    outside socket context, is certainly ugly. What can I do?
931  */
932 
933 static void tcp_v4_send_ack(const struct sock *sk,
934 			    struct sk_buff *skb, u32 seq, u32 ack,
935 			    u32 win, u32 tsval, u32 tsecr, int oif,
936 			    struct tcp_key *key,
937 			    int reply_flags, u8 tos, u32 txhash)
938 {
939 	const struct tcphdr *th = tcp_hdr(skb);
940 	struct {
941 		struct tcphdr th;
942 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
943 	} rep;
944 	struct net *net = sock_net(sk);
945 	struct ip_reply_arg arg;
946 	struct sock *ctl_sk;
947 	u64 transmit_time;
948 
949 	memset(&rep.th, 0, sizeof(struct tcphdr));
950 	memset(&arg, 0, sizeof(arg));
951 
952 	arg.iov[0].iov_base = (unsigned char *)&rep;
953 	arg.iov[0].iov_len  = sizeof(rep.th);
954 	if (tsecr) {
955 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
956 				   (TCPOPT_TIMESTAMP << 8) |
957 				   TCPOLEN_TIMESTAMP);
958 		rep.opt[1] = htonl(tsval);
959 		rep.opt[2] = htonl(tsecr);
960 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
961 	}
962 
963 	/* Swap the send and the receive. */
964 	rep.th.dest    = th->source;
965 	rep.th.source  = th->dest;
966 	rep.th.doff    = arg.iov[0].iov_len / 4;
967 	rep.th.seq     = htonl(seq);
968 	rep.th.ack_seq = htonl(ack);
969 	rep.th.ack     = 1;
970 	rep.th.window  = htons(win);
971 
972 #ifdef CONFIG_TCP_MD5SIG
973 	if (tcp_key_is_md5(key)) {
974 		int offset = (tsecr) ? 3 : 0;
975 
976 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
977 					  (TCPOPT_NOP << 16) |
978 					  (TCPOPT_MD5SIG << 8) |
979 					  TCPOLEN_MD5SIG);
980 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
981 		rep.th.doff = arg.iov[0].iov_len/4;
982 
983 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
984 				    key->md5_key, ip_hdr(skb)->saddr,
985 				    ip_hdr(skb)->daddr, &rep.th);
986 	}
987 #endif
988 #ifdef CONFIG_TCP_AO
989 	if (tcp_key_is_ao(key)) {
990 		int offset = (tsecr) ? 3 : 0;
991 
992 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
993 					  (tcp_ao_len(key->ao_key) << 16) |
994 					  (key->ao_key->sndid << 8) |
995 					  key->rcv_next);
996 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
997 		rep.th.doff = arg.iov[0].iov_len / 4;
998 
999 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1000 				key->ao_key, key->traffic_key,
1001 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1002 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1003 				&rep.th, key->sne);
1004 	}
1005 #endif
1006 	arg.flags = reply_flags;
1007 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1008 				      ip_hdr(skb)->saddr, /* XXX */
1009 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1010 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1011 	if (oif)
1012 		arg.bound_dev_if = oif;
1013 	arg.tos = tos;
1014 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1015 	local_bh_disable();
1016 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1017 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1018 	sock_net_set(ctl_sk, net);
1019 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1020 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1021 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1022 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1023 	transmit_time = tcp_transmit_time(sk);
1024 	ip_send_unicast_reply(ctl_sk,
1025 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1026 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1027 			      &arg, arg.iov[0].iov_len,
1028 			      transmit_time, txhash);
1029 
1030 	sock_net_set(ctl_sk, &init_net);
1031 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1032 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1033 	local_bh_enable();
1034 }
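
/*
 * Option-encoding example for the ACK built above (informational):
 * when a timestamp echo is needed, rep.opt[0] becomes
 * htonl(0x0101080a), i.e. the classic byte sequence NOP, NOP,
 * TIMESTAMP (kind 8), length 10, followed by the 32-bit tsval and
 * tsecr words.  The MD5 and AO branches append their options the same
 * way and grow doff accordingly, which is why rep.opt[] is sized as
 * MAX_TCP_OPTION_SPACE / 4 words.
 */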
1035 
1036 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1037 {
1038 	struct inet_timewait_sock *tw = inet_twsk(sk);
1039 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1040 	struct tcp_key key = {};
1041 #ifdef CONFIG_TCP_AO
1042 	struct tcp_ao_info *ao_info;
1043 
1044 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1045 		/* FIXME: the segment to-be-acked is not verified yet */
1046 		ao_info = rcu_dereference(tcptw->ao_info);
1047 		if (ao_info) {
1048 			const struct tcp_ao_hdr *aoh;
1049 
1050 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1051 				inet_twsk_put(tw);
1052 				return;
1053 			}
1054 
1055 			if (aoh)
1056 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1057 		}
1058 	}
1059 	if (key.ao_key) {
1060 		struct tcp_ao_key *rnext_key;
1061 
1062 		key.traffic_key = snd_other_key(key.ao_key);
1063 		key.sne = READ_ONCE(ao_info->snd_sne);
1064 		rnext_key = READ_ONCE(ao_info->rnext_key);
1065 		key.rcv_next = rnext_key->rcvid;
1066 		key.type = TCP_KEY_AO;
1067 #else
1068 	if (0) {
1069 #endif
1070 	} else if (static_branch_tcp_md5()) {
1071 		key.md5_key = tcp_twsk_md5_key(tcptw);
1072 		if (key.md5_key)
1073 			key.type = TCP_KEY_MD5;
1074 	}
1075 
1076 	tcp_v4_send_ack(sk, skb,
1077 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1078 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1079 			tcp_tw_tsval(tcptw),
1080 			READ_ONCE(tcptw->tw_ts_recent),
1081 			tw->tw_bound_dev_if, &key,
1082 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1083 			tw->tw_tos,
1084 			tw->tw_txhash);
1085 
1086 	inet_twsk_put(tw);
1087 }
1088 
1089 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1090 				  struct request_sock *req)
1091 {
1092 	struct tcp_key key = {};
1093 
1094 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1095 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1096 	 */
1097 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1098 					     tcp_sk(sk)->snd_nxt;
1099 
1100 #ifdef CONFIG_TCP_AO
1101 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1102 	    tcp_rsk_used_ao(req)) {
1103 		const union tcp_md5_addr *addr;
1104 		const struct tcp_ao_hdr *aoh;
1105 		int l3index;
1106 
1107 		/* Invalid TCP option size or twice included auth */
1108 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1109 			return;
1110 		if (!aoh)
1111 			return;
1112 
1113 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1114 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1115 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1116 					      aoh->rnext_keyid, -1);
1117 		if (unlikely(!key.ao_key)) {
1118 			/* Send ACK with any matching MKT for the peer */
1119 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1120 			/* Matching key disappeared (user removed the key?)
1121 			 * let the handshake time out.
1122 			 */
1123 			if (!key.ao_key) {
1124 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1125 						     addr,
1126 						     ntohs(tcp_hdr(skb)->source),
1127 						     &ip_hdr(skb)->daddr,
1128 						     ntohs(tcp_hdr(skb)->dest));
1129 				return;
1130 			}
1131 		}
1132 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1133 		if (!key.traffic_key)
1134 			return;
1135 
1136 		key.type = TCP_KEY_AO;
1137 		key.rcv_next = aoh->keyid;
1138 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1139 #else
1140 	if (0) {
1141 #endif
1142 	} else if (static_branch_tcp_md5()) {
1143 		const union tcp_md5_addr *addr;
1144 		int l3index;
1145 
1146 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1147 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1148 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1149 		if (key.md5_key)
1150 			key.type = TCP_KEY_MD5;
1151 	}
1152 
1153 	tcp_v4_send_ack(sk, skb, seq,
1154 			tcp_rsk(req)->rcv_nxt,
1155 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1156 			tcp_rsk_tsval(tcp_rsk(req)),
1157 			READ_ONCE(req->ts_recent),
1158 			0, &key,
1159 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1160 			ip_hdr(skb)->tos,
1161 			READ_ONCE(tcp_rsk(req)->txhash));
1162 	if (tcp_key_is_ao(&key))
1163 		kfree(key.traffic_key);
1164 }
1165 
1166 /*
1167  *	Send a SYN-ACK after having received a SYN.
1168  *	This still operates on a request_sock only, not on a big
1169  *	socket.
1170  */
1171 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1172 			      struct flowi *fl,
1173 			      struct request_sock *req,
1174 			      struct tcp_fastopen_cookie *foc,
1175 			      enum tcp_synack_type synack_type,
1176 			      struct sk_buff *syn_skb)
1177 {
1178 	const struct inet_request_sock *ireq = inet_rsk(req);
1179 	struct flowi4 fl4;
1180 	int err = -1;
1181 	struct sk_buff *skb;
1182 	u8 tos;
1183 
1184 	/* First, grab a route. */
1185 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1186 		return -1;
1187 
1188 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1189 
1190 	if (skb) {
1191 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1192 
1193 		tos = READ_ONCE(inet_sk(sk)->tos);
1194 
1195 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1196 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1197 			      (tos & INET_ECN_MASK);
1198 
1199 		if (!INET_ECN_is_capable(tos) &&
1200 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1201 			tos |= INET_ECN_ECT_0;
1202 
1203 		rcu_read_lock();
1204 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1205 					    ireq->ir_rmt_addr,
1206 					    rcu_dereference(ireq->ireq_opt),
1207 					    tos);
1208 		rcu_read_unlock();
1209 		err = net_xmit_eval(err);
1210 	}
1211 
1212 	return err;
1213 }
1214 
1215 /*
1216  *	IPv4 request_sock destructor.
1217  */
1218 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1219 {
1220 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1221 }
1222 
1223 #ifdef CONFIG_TCP_MD5SIG
1224 /*
1225  * RFC2385 MD5 checksumming requires a mapping of
1226  * IP address->MD5 Key.
1227  * We need to maintain these in the sk structure.
1228  */
1229 
1230 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1231 EXPORT_SYMBOL(tcp_md5_needed);
1232 
1233 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1234 {
1235 	if (!old)
1236 		return true;
1237 
1238 	/* l3index always overrides non-l3index */
1239 	if (old->l3index && new->l3index == 0)
1240 		return false;
1241 	if (old->l3index == 0 && new->l3index)
1242 		return true;
1243 
1244 	return old->prefixlen < new->prefixlen;
1245 }
1246 
1247 /* Find the Key structure for an address.  */
1248 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1249 					   const union tcp_md5_addr *addr,
1250 					   int family, bool any_l3index)
1251 {
1252 	const struct tcp_sock *tp = tcp_sk(sk);
1253 	struct tcp_md5sig_key *key;
1254 	const struct tcp_md5sig_info *md5sig;
1255 	__be32 mask;
1256 	struct tcp_md5sig_key *best_match = NULL;
1257 	bool match;
1258 
1259 	/* caller either holds rcu_read_lock() or socket lock */
1260 	md5sig = rcu_dereference_check(tp->md5sig_info,
1261 				       lockdep_sock_is_held(sk));
1262 	if (!md5sig)
1263 		return NULL;
1264 
1265 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1266 				 lockdep_sock_is_held(sk)) {
1267 		if (key->family != family)
1268 			continue;
1269 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1270 		    key->l3index != l3index)
1271 			continue;
1272 		if (family == AF_INET) {
1273 			mask = inet_make_mask(key->prefixlen);
1274 			match = (key->addr.a4.s_addr & mask) ==
1275 				(addr->a4.s_addr & mask);
1276 #if IS_ENABLED(CONFIG_IPV6)
1277 		} else if (family == AF_INET6) {
1278 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1279 						  key->prefixlen);
1280 #endif
1281 		} else {
1282 			match = false;
1283 		}
1284 
1285 		if (match && better_md5_match(best_match, key))
1286 			best_match = key;
1287 	}
1288 	return best_match;
1289 }
1290 EXPORT_SYMBOL(__tcp_md5_do_lookup);
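
/*
 * Lookup example (assumed keys, for illustration): a key installed for
 * 10.0.0.0 with prefixlen 8 matches a peer at 10.1.2.3 because
 * (10.1.2.3 & 255.0.0.0) == 10.0.0.0.  If a second key exists for
 * 10.1.2.0/24, better_md5_match() prefers it as the more specific
 * prefix, and a key bound to an L3 master device (non-zero l3index)
 * always wins over an unbound one regardless of prefix length.
 */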
1291 
1292 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1293 						      const union tcp_md5_addr *addr,
1294 						      int family, u8 prefixlen,
1295 						      int l3index, u8 flags)
1296 {
1297 	const struct tcp_sock *tp = tcp_sk(sk);
1298 	struct tcp_md5sig_key *key;
1299 	unsigned int size = sizeof(struct in_addr);
1300 	const struct tcp_md5sig_info *md5sig;
1301 
1302 	/* caller either holds rcu_read_lock() or socket lock */
1303 	md5sig = rcu_dereference_check(tp->md5sig_info,
1304 				       lockdep_sock_is_held(sk));
1305 	if (!md5sig)
1306 		return NULL;
1307 #if IS_ENABLED(CONFIG_IPV6)
1308 	if (family == AF_INET6)
1309 		size = sizeof(struct in6_addr);
1310 #endif
1311 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1312 				 lockdep_sock_is_held(sk)) {
1313 		if (key->family != family)
1314 			continue;
1315 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1316 			continue;
1317 		if (key->l3index != l3index)
1318 			continue;
1319 		if (!memcmp(&key->addr, addr, size) &&
1320 		    key->prefixlen == prefixlen)
1321 			return key;
1322 	}
1323 	return NULL;
1324 }
1325 
1326 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1327 					 const struct sock *addr_sk)
1328 {
1329 	const union tcp_md5_addr *addr;
1330 	int l3index;
1331 
1332 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1333 						 addr_sk->sk_bound_dev_if);
1334 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1335 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1336 }
1337 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1338 
1339 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1340 {
1341 	struct tcp_sock *tp = tcp_sk(sk);
1342 	struct tcp_md5sig_info *md5sig;
1343 
1344 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1345 	if (!md5sig)
1346 		return -ENOMEM;
1347 
1348 	sk_gso_disable(sk);
1349 	INIT_HLIST_HEAD(&md5sig->head);
1350 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1351 	return 0;
1352 }
1353 
1354 /* This can be called on a newly created socket, from other files */
1355 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1356 			    int family, u8 prefixlen, int l3index, u8 flags,
1357 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1358 {
1359 	/* Add Key to the list */
1360 	struct tcp_md5sig_key *key;
1361 	struct tcp_sock *tp = tcp_sk(sk);
1362 	struct tcp_md5sig_info *md5sig;
1363 
1364 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1365 	if (key) {
1366 		/* Pre-existing entry - just update that one.
1367 		 * Note that the key might be used concurrently.
1368 		 * data_race() is telling kcsan that we do not care of
1369 		 * key mismatches, since changing MD5 key on live flows
1370 		 * can lead to packet drops.
1371 		 */
1372 		data_race(memcpy(key->key, newkey, newkeylen));
1373 
1374 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1375 		 * Also note that a reader could catch new key->keylen value
1376 		 * but old key->key[], this is the reason we use __GFP_ZERO
1377 		 * at sock_kmalloc() time below these lines.
1378 		 */
1379 		WRITE_ONCE(key->keylen, newkeylen);
1380 
1381 		return 0;
1382 	}
1383 
1384 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1385 					   lockdep_sock_is_held(sk));
1386 
1387 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1388 	if (!key)
1389 		return -ENOMEM;
1390 
1391 	memcpy(key->key, newkey, newkeylen);
1392 	key->keylen = newkeylen;
1393 	key->family = family;
1394 	key->prefixlen = prefixlen;
1395 	key->l3index = l3index;
1396 	key->flags = flags;
1397 	memcpy(&key->addr, addr,
1398 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1399 								 sizeof(struct in_addr));
1400 	hlist_add_head_rcu(&key->node, &md5sig->head);
1401 	return 0;
1402 }
1403 
1404 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1405 		   int family, u8 prefixlen, int l3index, u8 flags,
1406 		   const u8 *newkey, u8 newkeylen)
1407 {
1408 	struct tcp_sock *tp = tcp_sk(sk);
1409 
1410 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1411 		if (tcp_md5_alloc_sigpool())
1412 			return -ENOMEM;
1413 
1414 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1415 			tcp_md5_release_sigpool();
1416 			return -ENOMEM;
1417 		}
1418 
1419 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1420 			struct tcp_md5sig_info *md5sig;
1421 
1422 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1423 			rcu_assign_pointer(tp->md5sig_info, NULL);
1424 			kfree_rcu(md5sig, rcu);
1425 			tcp_md5_release_sigpool();
1426 			return -EUSERS;
1427 		}
1428 	}
1429 
1430 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1431 				newkey, newkeylen, GFP_KERNEL);
1432 }
1433 EXPORT_SYMBOL(tcp_md5_do_add);
1434 
1435 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1436 		     int family, u8 prefixlen, int l3index,
1437 		     struct tcp_md5sig_key *key)
1438 {
1439 	struct tcp_sock *tp = tcp_sk(sk);
1440 
1441 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1442 		tcp_md5_add_sigpool();
1443 
1444 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1445 			tcp_md5_release_sigpool();
1446 			return -ENOMEM;
1447 		}
1448 
1449 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1450 			struct tcp_md5sig_info *md5sig;
1451 
1452 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1453 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1454 			rcu_assign_pointer(tp->md5sig_info, NULL);
1455 			kfree_rcu(md5sig, rcu);
1456 			tcp_md5_release_sigpool();
1457 			return -EUSERS;
1458 		}
1459 	}
1460 
1461 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1462 				key->flags, key->key, key->keylen,
1463 				sk_gfp_mask(sk, GFP_ATOMIC));
1464 }
1465 EXPORT_SYMBOL(tcp_md5_key_copy);
1466 
1467 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1468 		   u8 prefixlen, int l3index, u8 flags)
1469 {
1470 	struct tcp_md5sig_key *key;
1471 
1472 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1473 	if (!key)
1474 		return -ENOENT;
1475 	hlist_del_rcu(&key->node);
1476 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1477 	kfree_rcu(key, rcu);
1478 	return 0;
1479 }
1480 EXPORT_SYMBOL(tcp_md5_do_del);
1481 
1482 void tcp_clear_md5_list(struct sock *sk)
1483 {
1484 	struct tcp_sock *tp = tcp_sk(sk);
1485 	struct tcp_md5sig_key *key;
1486 	struct hlist_node *n;
1487 	struct tcp_md5sig_info *md5sig;
1488 
1489 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1490 
1491 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1492 		hlist_del_rcu(&key->node);
1493 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1494 		kfree_rcu(key, rcu);
1495 	}
1496 }
1497 
1498 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1499 				 sockptr_t optval, int optlen)
1500 {
1501 	struct tcp_md5sig cmd;
1502 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1503 	const union tcp_md5_addr *addr;
1504 	u8 prefixlen = 32;
1505 	int l3index = 0;
1506 	bool l3flag;
1507 	u8 flags;
1508 
1509 	if (optlen < sizeof(cmd))
1510 		return -EINVAL;
1511 
1512 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1513 		return -EFAULT;
1514 
1515 	if (sin->sin_family != AF_INET)
1516 		return -EINVAL;
1517 
1518 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520 
1521 	if (optname == TCP_MD5SIG_EXT &&
1522 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1523 		prefixlen = cmd.tcpm_prefixlen;
1524 		if (prefixlen > 32)
1525 			return -EINVAL;
1526 	}
1527 
1528 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1529 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1530 		struct net_device *dev;
1531 
1532 		rcu_read_lock();
1533 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1534 		if (dev && netif_is_l3_master(dev))
1535 			l3index = dev->ifindex;
1536 
1537 		rcu_read_unlock();
1538 
1539 		/* ok to reference set/not set outside of rcu;
1540 		 * right now device MUST be an L3 master
1541 		 */
1542 		if (!dev || !l3index)
1543 			return -EINVAL;
1544 	}
1545 
1546 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1547 
1548 	if (!cmd.tcpm_keylen)
1549 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1550 
1551 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1552 		return -EINVAL;
1553 
1554 	/* Don't allow keys for peers that have a matching TCP-AO key.
1555 	 * See the comment in tcp_ao_add_cmd()
1556 	 */
1557 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1558 		return -EKEYREJECTED;
1559 
1560 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1561 			      cmd.tcpm_key, cmd.tcpm_keylen);
1562 }
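
/*
 * User-space sketch of the setsockopt() that lands here (illustrative
 * only; "fd", the 192.0.2.1 peer and the "secret" key are
 * placeholders, error handling omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT with TCP_MD5SIG_FLAG_PREFIX/TCP_MD5SIG_FLAG_IFINDEX
 * additionally fills tcpm_prefixlen/tcpm_ifindex, matching the checks
 * above.
 */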
1563 
1564 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1565 				   __be32 daddr, __be32 saddr,
1566 				   const struct tcphdr *th, int nbytes)
1567 {
1568 	struct tcp4_pseudohdr *bp;
1569 	struct scatterlist sg;
1570 	struct tcphdr *_th;
1571 
1572 	bp = hp->scratch;
1573 	bp->saddr = saddr;
1574 	bp->daddr = daddr;
1575 	bp->pad = 0;
1576 	bp->protocol = IPPROTO_TCP;
1577 	bp->len = cpu_to_be16(nbytes);
1578 
1579 	_th = (struct tcphdr *)(bp + 1);
1580 	memcpy(_th, th, sizeof(*th));
1581 	_th->check = 0;
1582 
1583 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1584 	ahash_request_set_crypt(hp->req, &sg, NULL,
1585 				sizeof(*bp) + sizeof(*th));
1586 	return crypto_ahash_update(hp->req);
1587 }
1588 
1589 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1590 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1591 {
1592 	struct tcp_sigpool hp;
1593 
1594 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1595 		goto clear_hash_nostart;
1596 
1597 	if (crypto_ahash_init(hp.req))
1598 		goto clear_hash;
1599 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1600 		goto clear_hash;
1601 	if (tcp_md5_hash_key(&hp, key))
1602 		goto clear_hash;
1603 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1604 	if (crypto_ahash_final(hp.req))
1605 		goto clear_hash;
1606 
1607 	tcp_sigpool_end(&hp);
1608 	return 0;
1609 
1610 clear_hash:
1611 	tcp_sigpool_end(&hp);
1612 clear_hash_nostart:
1613 	memset(md5_hash, 0, 16);
1614 	return 1;
1615 }
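
/*
 * For orientation, the RFC 2385 digest computed above covers, in
 * order: the IPv4 pseudo-header (saddr, daddr, zero pad, protocol 6,
 * TCP length), the fixed 20-byte TCP header with its checksum field
 * zeroed, and finally the key itself.  tcp_v4_md5_hash_skb() below
 * additionally folds in the segment payload between the header and the
 * key, which is the form used to validate received segments.
 */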
1616 
1617 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1618 			const struct sock *sk,
1619 			const struct sk_buff *skb)
1620 {
1621 	const struct tcphdr *th = tcp_hdr(skb);
1622 	struct tcp_sigpool hp;
1623 	__be32 saddr, daddr;
1624 
1625 	if (sk) { /* valid for establish/request sockets */
1626 		saddr = sk->sk_rcv_saddr;
1627 		daddr = sk->sk_daddr;
1628 	} else {
1629 		const struct iphdr *iph = ip_hdr(skb);
1630 		saddr = iph->saddr;
1631 		daddr = iph->daddr;
1632 	}
1633 
1634 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1635 		goto clear_hash_nostart;
1636 
1637 	if (crypto_ahash_init(hp.req))
1638 		goto clear_hash;
1639 
1640 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1641 		goto clear_hash;
1642 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1643 		goto clear_hash;
1644 	if (tcp_md5_hash_key(&hp, key))
1645 		goto clear_hash;
1646 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1647 	if (crypto_ahash_final(hp.req))
1648 		goto clear_hash;
1649 
1650 	tcp_sigpool_end(&hp);
1651 	return 0;
1652 
1653 clear_hash:
1654 	tcp_sigpool_end(&hp);
1655 clear_hash_nostart:
1656 	memset(md5_hash, 0, 16);
1657 	return 1;
1658 }
1659 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1660 
1661 #endif
1662 
1663 static void tcp_v4_init_req(struct request_sock *req,
1664 			    const struct sock *sk_listener,
1665 			    struct sk_buff *skb)
1666 {
1667 	struct inet_request_sock *ireq = inet_rsk(req);
1668 	struct net *net = sock_net(sk_listener);
1669 
1670 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1671 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1672 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1673 }
1674 
1675 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1676 					  struct sk_buff *skb,
1677 					  struct flowi *fl,
1678 					  struct request_sock *req,
1679 					  u32 tw_isn)
1680 {
1681 	tcp_v4_init_req(req, sk, skb);
1682 
1683 	if (security_inet_conn_request(sk, skb, req))
1684 		return NULL;
1685 
1686 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1687 }
1688 
1689 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1690 	.family		=	PF_INET,
1691 	.obj_size	=	sizeof(struct tcp_request_sock),
1692 	.rtx_syn_ack	=	tcp_rtx_synack,
1693 	.send_ack	=	tcp_v4_reqsk_send_ack,
1694 	.destructor	=	tcp_v4_reqsk_destructor,
1695 	.send_reset	=	tcp_v4_send_reset,
1696 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1697 };
1698 
1699 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1700 	.mss_clamp	=	TCP_MSS_DEFAULT,
1701 #ifdef CONFIG_TCP_MD5SIG
1702 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1703 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1704 #endif
1705 #ifdef CONFIG_TCP_AO
1706 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1707 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1708 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1709 #endif
1710 #ifdef CONFIG_SYN_COOKIES
1711 	.cookie_init_seq =	cookie_v4_init_sequence,
1712 #endif
1713 	.route_req	=	tcp_v4_route_req,
1714 	.init_seq	=	tcp_v4_init_seq,
1715 	.init_ts_off	=	tcp_v4_init_ts_off,
1716 	.send_synack	=	tcp_v4_send_synack,
1717 };
1718 
1719 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1720 {
1721 	/* Never answer SYNs sent to broadcast or multicast addresses */
1722 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1723 		goto drop;
1724 
1725 	return tcp_conn_request(&tcp_request_sock_ops,
1726 				&tcp_request_sock_ipv4_ops, sk, skb);
1727 
1728 drop:
1729 	tcp_listendrop(sk);
1730 	return 0;
1731 }
1732 EXPORT_SYMBOL(tcp_v4_conn_request);
1733 
1734 
1735 /*
1736  * The three-way handshake has completed - we received a valid ACK for
1737  * our SYN-ACK - now create the new socket.
1738  */
1739 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1740 				  struct request_sock *req,
1741 				  struct dst_entry *dst,
1742 				  struct request_sock *req_unhash,
1743 				  bool *own_req)
1744 {
1745 	struct inet_request_sock *ireq;
1746 	bool found_dup_sk = false;
1747 	struct inet_sock *newinet;
1748 	struct tcp_sock *newtp;
1749 	struct sock *newsk;
1750 #ifdef CONFIG_TCP_MD5SIG
1751 	const union tcp_md5_addr *addr;
1752 	struct tcp_md5sig_key *key;
1753 	int l3index;
1754 #endif
1755 	struct ip_options_rcu *inet_opt;
1756 
1757 	if (sk_acceptq_is_full(sk))
1758 		goto exit_overflow;
1759 
1760 	newsk = tcp_create_openreq_child(sk, req, skb);
1761 	if (!newsk)
1762 		goto exit_nonewsk;
1763 
1764 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1765 	inet_sk_rx_dst_set(newsk, skb);
1766 
1767 	newtp		      = tcp_sk(newsk);
1768 	newinet		      = inet_sk(newsk);
1769 	ireq		      = inet_rsk(req);
1770 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1771 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1772 	newsk->sk_bound_dev_if = ireq->ir_iif;
1773 	newinet->inet_saddr   = ireq->ir_loc_addr;
1774 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1775 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1776 	newinet->mc_index     = inet_iif(skb);
1777 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1778 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1779 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1780 	if (inet_opt)
1781 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1782 	atomic_set(&newinet->inet_id, get_random_u16());
1783 
1784 	/* Set the ToS of the new socket based upon the value of the incoming SYN.
1785 	 * ECT bits are set later in tcp_init_transfer().
1786 	 */
1787 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1788 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1789 
1790 	if (!dst) {
1791 		dst = inet_csk_route_child_sock(sk, newsk, req);
1792 		if (!dst)
1793 			goto put_and_exit;
1794 	} else {
1795 		/* syncookie case: see end of cookie_v4_check() */
1796 	}
1797 	sk_setup_caps(newsk, dst);
1798 
1799 	tcp_ca_openreq_child(newsk, dst);
1800 
1801 	tcp_sync_mss(newsk, dst_mtu(dst));
1802 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1803 
1804 	tcp_initialize_rcv_mss(newsk);
1805 
1806 #ifdef CONFIG_TCP_MD5SIG
1807 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1808 	/* Copy over the MD5 key from the original socket */
1809 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1810 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1811 	if (key && !tcp_rsk_used_ao(req)) {
1812 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1813 			goto put_and_exit;
1814 		sk_gso_disable(newsk);
1815 	}
1816 #endif
1817 #ifdef CONFIG_TCP_AO
1818 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1819 		goto put_and_exit; /* OOM, release back memory */
1820 #endif
1821 
1822 	if (__inet_inherit_port(sk, newsk) < 0)
1823 		goto put_and_exit;
1824 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1825 				       &found_dup_sk);
1826 	if (likely(*own_req)) {
1827 		tcp_move_syn(newtp, req);
1828 		ireq->ireq_opt = NULL;
1829 	} else {
1830 		newinet->inet_opt = NULL;
1831 
1832 		if (!req_unhash && found_dup_sk) {
1833 			/* This code path should only be executed in the
1834 			 * syncookie case
1835 			 */
1836 			bh_unlock_sock(newsk);
1837 			sock_put(newsk);
1838 			newsk = NULL;
1839 		}
1840 	}
1841 	return newsk;
1842 
1843 exit_overflow:
1844 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1845 exit_nonewsk:
1846 	dst_release(dst);
1847 exit:
1848 	tcp_listendrop(sk);
1849 	return NULL;
1850 put_and_exit:
1851 	newinet->inet_opt = NULL;
1852 	inet_csk_prepare_forced_close(newsk);
1853 	tcp_done(newsk);
1854 	goto exit;
1855 }
1856 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1857 
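/* With CONFIG_SYN_COOKIES, non-SYN segments that reach a listening socket are
 * handed to cookie_v4_check(), which may validate a syncookie carried in the
 * ACK and create the corresponding child socket.  The caller, tcp_v4_do_rcv(),
 * treats a NULL return as "drop", the listener itself as "process normally",
 * and any other socket as a freshly created child.
 */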
1858 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1859 {
1860 #ifdef CONFIG_SYN_COOKIES
1861 	const struct tcphdr *th = tcp_hdr(skb);
1862 
1863 	if (!th->syn)
1864 		sk = cookie_v4_check(sk, skb);
1865 #endif
1866 	return sk;
1867 }
1868 
1869 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1870 			 struct tcphdr *th, u32 *cookie)
1871 {
1872 	u16 mss = 0;
1873 #ifdef CONFIG_SYN_COOKIES
1874 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1875 				    &tcp_request_sock_ipv4_ops, sk, th);
1876 	if (mss) {
1877 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1878 		tcp_synq_overflow(sk);
1879 	}
1880 #endif
1881 	return mss;
1882 }
1883 
1884 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1885 							   u32));
1886 /* The socket must have its spinlock held when we get
1887  * here, unless it is a TCP_LISTEN socket.
1888  *
1889  * We have a potential double-lock case here, so even when
1890  * doing backlog processing we use the BH locking scheme.
1891  * This is because we cannot sleep with the original spinlock
1892  * held.
1893  */
1894 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1895 {
1896 	enum skb_drop_reason reason;
1897 	struct sock *rsk;
1898 
1899 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1900 		struct dst_entry *dst;
1901 
1902 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1903 						lockdep_sock_is_held(sk));
1904 
1905 		sock_rps_save_rxhash(sk, skb);
1906 		sk_mark_napi_id(sk, skb);
1907 		if (dst) {
1908 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1909 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1910 					     dst, 0)) {
1911 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1912 				dst_release(dst);
1913 			}
1914 		}
1915 		tcp_rcv_established(sk, skb);
1916 		return 0;
1917 	}
1918 
1919 	if (tcp_checksum_complete(skb))
1920 		goto csum_err;
1921 
1922 	if (sk->sk_state == TCP_LISTEN) {
1923 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1924 
1925 		if (!nsk)
1926 			return 0;
1927 		if (nsk != sk) {
1928 			reason = tcp_child_process(sk, nsk, skb);
1929 			if (reason) {
1930 				rsk = nsk;
1931 				goto reset;
1932 			}
1933 			return 0;
1934 		}
1935 	} else
1936 		sock_rps_save_rxhash(sk, skb);
1937 
1938 	reason = tcp_rcv_state_process(sk, skb);
1939 	if (reason) {
1940 		rsk = sk;
1941 		goto reset;
1942 	}
1943 	return 0;
1944 
1945 reset:
1946 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1947 discard:
1948 	sk_skb_reason_drop(sk, skb, reason);
1949 	/* Be careful here. If this function gets more complicated and
1950 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1951 	 * might be destroyed here. This current version compiles correctly,
1952 	 * but you have been warned.
1953 	 */
1954 	return 0;
1955 
1956 csum_err:
1957 	reason = SKB_DROP_REASON_TCP_CSUM;
1958 	trace_tcp_bad_csum(skb);
1959 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1960 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1961 	goto discard;
1962 }
1963 EXPORT_SYMBOL(tcp_v4_do_rcv);
1964 
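/* Early demux: if an established socket matches the incoming segment's
 * 4-tuple, attach it to the skb (with sock_edemux as destructor) and, when
 * the socket's cached input route is still valid for this interface, reuse
 * that dst as well, so the main receive path can skip both lookups.
 */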
1965 int tcp_v4_early_demux(struct sk_buff *skb)
1966 {
1967 	struct net *net = dev_net(skb->dev);
1968 	const struct iphdr *iph;
1969 	const struct tcphdr *th;
1970 	struct sock *sk;
1971 
1972 	if (skb->pkt_type != PACKET_HOST)
1973 		return 0;
1974 
1975 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1976 		return 0;
1977 
1978 	iph = ip_hdr(skb);
1979 	th = tcp_hdr(skb);
1980 
1981 	if (th->doff < sizeof(struct tcphdr) / 4)
1982 		return 0;
1983 
1984 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1985 				       iph->saddr, th->source,
1986 				       iph->daddr, ntohs(th->dest),
1987 				       skb->skb_iif, inet_sdif(skb));
1988 	if (sk) {
1989 		skb->sk = sk;
1990 		skb->destructor = sock_edemux;
1991 		if (sk_fullsock(sk)) {
1992 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1993 
1994 			if (dst)
1995 				dst = dst_check(dst, 0);
1996 			if (dst &&
1997 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1998 				skb_dst_set_noref(skb, dst);
1999 		}
2000 	}
2001 	return 0;
2002 }
2003 
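/* Queue a segment on the socket backlog while the socket is owned by user
 * context.  We first try to coalesce the segment with the backlog tail
 * (segments that are contiguous in sequence space and have compatible
 * headers are merged to save memory and per-skb processing); otherwise the
 * skb is queued as-is, subject to the limit computed below from sk_rcvbuf
 * and sk_sndbuf.  Returns true when the skb was dropped; in that case the
 * socket has already been unlocked here.
 */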
2004 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2005 		     enum skb_drop_reason *reason)
2006 {
2007 	u32 tail_gso_size, tail_gso_segs;
2008 	struct skb_shared_info *shinfo;
2009 	const struct tcphdr *th;
2010 	struct tcphdr *thtail;
2011 	struct sk_buff *tail;
2012 	unsigned int hdrlen;
2013 	bool fragstolen;
2014 	u32 gso_segs;
2015 	u32 gso_size;
2016 	u64 limit;
2017 	int delta;
2018 
2019 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2020 	 * we can fix skb->truesize to its real value to avoid future drops.
2021 	 * This is valid because skb is not yet charged to the socket.
2022 	 * It has been noticed that pure SACK packets were sometimes dropped
2023 	 * (if cooked by drivers without the copybreak feature).
2024 	 */
2025 	skb_condense(skb);
2026 
2027 	skb_dst_drop(skb);
2028 
2029 	if (unlikely(tcp_checksum_complete(skb))) {
2030 		bh_unlock_sock(sk);
2031 		trace_tcp_bad_csum(skb);
2032 		*reason = SKB_DROP_REASON_TCP_CSUM;
2033 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2034 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2035 		return true;
2036 	}
2037 
2038 	/* Attempt coalescing to last skb in backlog, even if we are
2039 	 * above the limits.
2040 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2041 	 */
2042 	th = (const struct tcphdr *)skb->data;
2043 	hdrlen = th->doff * 4;
2044 
2045 	tail = sk->sk_backlog.tail;
2046 	if (!tail)
2047 		goto no_coalesce;
2048 	thtail = (struct tcphdr *)tail->data;
2049 
2050 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2051 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2052 	    ((TCP_SKB_CB(tail)->tcp_flags |
2053 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2054 	    !((TCP_SKB_CB(tail)->tcp_flags &
2055 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2056 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2057 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2058 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2059 	    thtail->doff != th->doff ||
2060 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2061 		goto no_coalesce;
2062 
2063 	__skb_pull(skb, hdrlen);
2064 
2065 	shinfo = skb_shinfo(skb);
2066 	gso_size = shinfo->gso_size ?: skb->len;
2067 	gso_segs = shinfo->gso_segs ?: 1;
2068 
2069 	shinfo = skb_shinfo(tail);
2070 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2071 	tail_gso_segs = shinfo->gso_segs ?: 1;
2072 
2073 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2074 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2075 
2076 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2077 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2078 			thtail->window = th->window;
2079 		}
2080 
2081 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2082 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2083 		 * is not entered if we append a packet with a FIN.
2084 		 * SYN, RST, URG are not present.
2085 		 * ACK is set on both packets.
2086 		 * PSH: we do not really care in the TCP stack,
2087 		 *      at least for 'GRO' packets.
2088 		 */
2089 		thtail->fin |= th->fin;
2090 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2091 
2092 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2093 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2094 			tail->tstamp = skb->tstamp;
2095 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2096 		}
2097 
2098 		/* Not as strict as GRO. We only need to carry mss max value */
2099 		shinfo->gso_size = max(gso_size, tail_gso_size);
2100 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2101 
2102 		sk->sk_backlog.len += delta;
2103 		__NET_INC_STATS(sock_net(sk),
2104 				LINUX_MIB_TCPBACKLOGCOALESCE);
2105 		kfree_skb_partial(skb, fragstolen);
2106 		return false;
2107 	}
2108 	__skb_push(skb, hdrlen);
2109 
2110 no_coalesce:
2111 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2112 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2113 	 * sk_rcvbuf in normal conditions.
2114 	 */
2115 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2116 
2117 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2118 
2119 	/* Only the socket owner can try to collapse/prune rx queues
2120 	 * to reduce memory overhead, so add a little headroom here.
2121 	 * Only a few socket backlogs are likely to be non-empty at any given time.
2122 	 */
2123 	limit += 64 * 1024;
2124 
2125 	limit = min_t(u64, limit, UINT_MAX);
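	/* Illustration (hypothetical values): with sk_rcvbuf == 131072 and
	 * sk_sndbuf == 16384, limit = 262144 + 8192 + 65536 = 335872 bytes
	 * before the UINT_MAX clamp above.
	 */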
2126 
2127 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2128 		bh_unlock_sock(sk);
2129 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2130 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2131 		return true;
2132 	}
2133 	return false;
2134 }
2135 EXPORT_SYMBOL(tcp_add_backlog);
2136 
2137 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2138 {
2139 	struct tcphdr *th = (struct tcphdr *)skb->data;
2140 
2141 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2142 }
2143 EXPORT_SYMBOL(tcp_filter);
2144 
2145 static void tcp_v4_restore_cb(struct sk_buff *skb)
2146 {
2147 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2148 		sizeof(struct inet_skb_parm));
2149 }
2150 
2151 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2152 			   const struct tcphdr *th)
2153 {
2154 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2155 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2156 	 */
2157 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2158 		sizeof(struct inet_skb_parm));
2159 	barrier();
2160 
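	/* SYN and FIN each consume one unit of sequence space, hence the
	 * th->syn and th->fin terms in end_seq below.
	 */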
2161 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2162 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2163 				    skb->len - th->doff * 4);
2164 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2165 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2166 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2167 	TCP_SKB_CB(skb)->sacked	 = 0;
2168 	TCP_SKB_CB(skb)->has_rxtstamp =
2169 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2170 }
2171 
2172 /*
2173  *	From tcp_input.c
2174  */
2175 
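/* Main IPv4 TCP receive routine.  In outline: validate the header and
 * checksum, look up a socket for the segment, handle the TCP_NEW_SYN_RECV
 * and TCP_TIME_WAIT special cases, run XFRM policy, MD5/AO and socket
 * filter checks, then either process the segment directly (socket not
 * owned by user) or queue it on the backlog.
 */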
2176 int tcp_v4_rcv(struct sk_buff *skb)
2177 {
2178 	struct net *net = dev_net(skb->dev);
2179 	enum skb_drop_reason drop_reason;
2180 	int sdif = inet_sdif(skb);
2181 	int dif = inet_iif(skb);
2182 	const struct iphdr *iph;
2183 	const struct tcphdr *th;
2184 	struct sock *sk = NULL;
2185 	bool refcounted;
2186 	int ret;
2187 	u32 isn;
2188 
2189 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2190 	if (skb->pkt_type != PACKET_HOST)
2191 		goto discard_it;
2192 
2193 	/* Count it even if it's bad */
2194 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2195 
2196 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2197 		goto discard_it;
2198 
2199 	th = (const struct tcphdr *)skb->data;
2200 
2201 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2202 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2203 		goto bad_packet;
2204 	}
2205 	if (!pskb_may_pull(skb, th->doff * 4))
2206 		goto discard_it;
2207 
2208 	/* An explanation is required here, I think.
2209 	 * Packet length and doff are validated by header prediction,
2210 	 * provided the case of th->doff == 0 is eliminated.
2211 	 * So, we defer the checks. */
2212 
2213 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2214 		goto csum_error;
2215 
2216 	th = (const struct tcphdr *)skb->data;
2217 	iph = ip_hdr(skb);
2218 lookup:
2219 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2220 			       skb, __tcp_hdrlen(th), th->source,
2221 			       th->dest, sdif, &refcounted);
2222 	if (!sk)
2223 		goto no_tcp_socket;
2224 
2225 	if (sk->sk_state == TCP_TIME_WAIT)
2226 		goto do_time_wait;
2227 
2228 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2229 		struct request_sock *req = inet_reqsk(sk);
2230 		bool req_stolen = false;
2231 		struct sock *nsk;
2232 
2233 		sk = req->rsk_listener;
2234 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2235 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2236 		else
2237 			drop_reason = tcp_inbound_hash(sk, req, skb,
2238 						       &iph->saddr, &iph->daddr,
2239 						       AF_INET, dif, sdif);
2240 		if (unlikely(drop_reason)) {
2241 			sk_drops_add(sk, skb);
2242 			reqsk_put(req);
2243 			goto discard_it;
2244 		}
2245 		if (tcp_checksum_complete(skb)) {
2246 			reqsk_put(req);
2247 			goto csum_error;
2248 		}
2249 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2250 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2251 			if (!nsk) {
2252 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2253 				goto lookup;
2254 			}
2255 			sk = nsk;
2256 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2257 			 * before returning.
2258 			 */
2259 		} else {
2260 			/* We own a reference on the listener, increase it again
2261 			 * as we might lose it too soon.
2262 			 */
2263 			sock_hold(sk);
2264 		}
2265 		refcounted = true;
2266 		nsk = NULL;
2267 		if (!tcp_filter(sk, skb)) {
2268 			th = (const struct tcphdr *)skb->data;
2269 			iph = ip_hdr(skb);
2270 			tcp_v4_fill_cb(skb, iph, th);
2271 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2272 		} else {
2273 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2274 		}
2275 		if (!nsk) {
2276 			reqsk_put(req);
2277 			if (req_stolen) {
2278 				/* Another cpu got exclusive access to req
2279 				 * and created a full blown socket.
2280 				 * Try to feed this packet to this socket
2281 				 * instead of discarding it.
2282 				 */
2283 				tcp_v4_restore_cb(skb);
2284 				sock_put(sk);
2285 				goto lookup;
2286 			}
2287 			goto discard_and_relse;
2288 		}
2289 		nf_reset_ct(skb);
2290 		if (nsk == sk) {
2291 			reqsk_put(req);
2292 			tcp_v4_restore_cb(skb);
2293 		} else {
2294 			drop_reason = tcp_child_process(sk, nsk, skb);
2295 			if (drop_reason) {
2296 				enum sk_rst_reason rst_reason;
2297 
2298 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2299 				tcp_v4_send_reset(nsk, skb, rst_reason);
2300 				goto discard_and_relse;
2301 			}
2302 			sock_put(sk);
2303 			return 0;
2304 		}
2305 	}
2306 
2307 process:
2308 	if (static_branch_unlikely(&ip4_min_ttl)) {
2309 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2310 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2311 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2312 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2313 			goto discard_and_relse;
2314 		}
2315 	}
2316 
2317 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2318 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2319 		goto discard_and_relse;
2320 	}
2321 
2322 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2323 				       AF_INET, dif, sdif);
2324 	if (drop_reason)
2325 		goto discard_and_relse;
2326 
2327 	nf_reset_ct(skb);
2328 
2329 	if (tcp_filter(sk, skb)) {
2330 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2331 		goto discard_and_relse;
2332 	}
2333 	th = (const struct tcphdr *)skb->data;
2334 	iph = ip_hdr(skb);
2335 	tcp_v4_fill_cb(skb, iph, th);
2336 
2337 	skb->dev = NULL;
2338 
2339 	if (sk->sk_state == TCP_LISTEN) {
2340 		ret = tcp_v4_do_rcv(sk, skb);
2341 		goto put_and_return;
2342 	}
2343 
2344 	sk_incoming_cpu_update(sk);
2345 
2346 	bh_lock_sock_nested(sk);
2347 	tcp_segs_in(tcp_sk(sk), skb);
2348 	ret = 0;
2349 	if (!sock_owned_by_user(sk)) {
2350 		ret = tcp_v4_do_rcv(sk, skb);
2351 	} else {
2352 		if (tcp_add_backlog(sk, skb, &drop_reason))
2353 			goto discard_and_relse;
2354 	}
2355 	bh_unlock_sock(sk);
2356 
2357 put_and_return:
2358 	if (refcounted)
2359 		sock_put(sk);
2360 
2361 	return ret;
2362 
2363 no_tcp_socket:
2364 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2365 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2366 		goto discard_it;
2367 
2368 	tcp_v4_fill_cb(skb, iph, th);
2369 
2370 	if (tcp_checksum_complete(skb)) {
2371 csum_error:
2372 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2373 		trace_tcp_bad_csum(skb);
2374 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2375 bad_packet:
2376 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2377 	} else {
2378 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2379 	}
2380 
2381 discard_it:
2382 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2383 	/* Discard frame. */
2384 	sk_skb_reason_drop(sk, skb, drop_reason);
2385 	return 0;
2386 
2387 discard_and_relse:
2388 	sk_drops_add(sk, skb);
2389 	if (refcounted)
2390 		sock_put(sk);
2391 	goto discard_it;
2392 
2393 do_time_wait:
2394 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2395 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2396 		inet_twsk_put(inet_twsk(sk));
2397 		goto discard_it;
2398 	}
2399 
2400 	tcp_v4_fill_cb(skb, iph, th);
2401 
2402 	if (tcp_checksum_complete(skb)) {
2403 		inet_twsk_put(inet_twsk(sk));
2404 		goto csum_error;
2405 	}
2406 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2407 	case TCP_TW_SYN: {
2408 		struct sock *sk2 = inet_lookup_listener(net,
2409 							net->ipv4.tcp_death_row.hashinfo,
2410 							skb, __tcp_hdrlen(th),
2411 							iph->saddr, th->source,
2412 							iph->daddr, th->dest,
2413 							inet_iif(skb),
2414 							sdif);
2415 		if (sk2) {
2416 			inet_twsk_deschedule_put(inet_twsk(sk));
2417 			sk = sk2;
2418 			tcp_v4_restore_cb(skb);
2419 			refcounted = false;
2420 			__this_cpu_write(tcp_tw_isn, isn);
2421 			goto process;
2422 		}
2423 	}
2424 		/* to ACK */
2425 		fallthrough;
2426 	case TCP_TW_ACK:
2427 		tcp_v4_timewait_ack(sk, skb);
2428 		break;
2429 	case TCP_TW_RST:
2430 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2431 		inet_twsk_deschedule_put(inet_twsk(sk));
2432 		goto discard_it;
2433 	case TCP_TW_SUCCESS:;
2434 	}
2435 	goto discard_it;
2436 }
2437 
2438 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2439 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2440 	.twsk_destructor= tcp_twsk_destructor,
2441 };
2442 
2443 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2444 {
2445 	struct dst_entry *dst = skb_dst(skb);
2446 
2447 	if (dst && dst_hold_safe(dst)) {
2448 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2449 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2450 	}
2451 }
2452 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2453 
2454 const struct inet_connection_sock_af_ops ipv4_specific = {
2455 	.queue_xmit	   = ip_queue_xmit,
2456 	.send_check	   = tcp_v4_send_check,
2457 	.rebuild_header	   = inet_sk_rebuild_header,
2458 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2459 	.conn_request	   = tcp_v4_conn_request,
2460 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2461 	.net_header_len	   = sizeof(struct iphdr),
2462 	.setsockopt	   = ip_setsockopt,
2463 	.getsockopt	   = ip_getsockopt,
2464 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2465 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2466 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2467 };
2468 EXPORT_SYMBOL(ipv4_specific);
2469 
2470 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2471 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2472 #ifdef CONFIG_TCP_MD5SIG
2473 	.md5_lookup		= tcp_v4_md5_lookup,
2474 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2475 	.md5_parse		= tcp_v4_parse_md5_keys,
2476 #endif
2477 #ifdef CONFIG_TCP_AO
2478 	.ao_lookup		= tcp_v4_ao_lookup,
2479 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2480 	.ao_parse		= tcp_v4_parse_ao,
2481 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2482 #endif
2483 };
2484 #endif
2485 
2486 /* NOTE: A lot of things are set to zero explicitly by the call to
2487  *       sk_alloc(), so they need not be done here.
2488  */
2489 static int tcp_v4_init_sock(struct sock *sk)
2490 {
2491 	struct inet_connection_sock *icsk = inet_csk(sk);
2492 
2493 	tcp_init_sock(sk);
2494 
2495 	icsk->icsk_af_ops = &ipv4_specific;
2496 
2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2499 #endif
2500 
2501 	return 0;
2502 }
2503 
2504 #ifdef CONFIG_TCP_MD5SIG
2505 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2506 {
2507 	struct tcp_md5sig_info *md5sig;
2508 
2509 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2510 	kfree(md5sig);
2511 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2512 	tcp_md5_release_sigpool();
2513 }
2514 #endif
2515 
2516 static void tcp_release_user_frags(struct sock *sk)
2517 {
2518 #ifdef CONFIG_PAGE_POOL
2519 	unsigned long index;
2520 	void *netmem;
2521 
2522 	xa_for_each(&sk->sk_user_frags, index, netmem)
2523 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2524 #endif
2525 }
2526 
2527 void tcp_v4_destroy_sock(struct sock *sk)
2528 {
2529 	struct tcp_sock *tp = tcp_sk(sk);
2530 
2531 	tcp_release_user_frags(sk);
2532 
2533 	xa_destroy(&sk->sk_user_frags);
2534 
2535 	trace_tcp_destroy_sock(sk);
2536 
2537 	tcp_clear_xmit_timers(sk);
2538 
2539 	tcp_cleanup_congestion_control(sk);
2540 
2541 	tcp_cleanup_ulp(sk);
2542 
2543 	/* Clean up the write buffer. */
2544 	tcp_write_queue_purge(sk);
2545 
2546 	/* Check if we want to disable active TFO */
2547 	tcp_fastopen_active_disable_ofo_check(sk);
2548 
2549 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2550 	skb_rbtree_purge(&tp->out_of_order_queue);
2551 
2552 #ifdef CONFIG_TCP_MD5SIG
2553 	/* Clean up the MD5 key list, if any */
2554 	if (tp->md5sig_info) {
2555 		struct tcp_md5sig_info *md5sig;
2556 
2557 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2558 		tcp_clear_md5_list(sk);
2559 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2560 		rcu_assign_pointer(tp->md5sig_info, NULL);
2561 	}
2562 #endif
2563 	tcp_ao_destroy_sock(sk, false);
2564 
2565 	/* Clean up a referenced TCP bind bucket. */
2566 	if (inet_csk(sk)->icsk_bind_hash)
2567 		inet_put_port(sk);
2568 
2569 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2570 
2571 	/* If socket is aborted during connect operation */
2572 	tcp_free_fastopen_req(tp);
2573 	tcp_fastopen_destroy_cipher(sk);
2574 	tcp_saved_syn_free(tp);
2575 
2576 	sk_sockets_allocated_dec(sk);
2577 }
2578 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2579 
2580 #ifdef CONFIG_PROC_FS
2581 /* Proc filesystem TCP sock list dumping. */
2582 
2583 static unsigned short seq_file_family(const struct seq_file *seq);
2584 
2585 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2586 {
2587 	unsigned short family = seq_file_family(seq);
2588 
2589 	/* AF_UNSPEC is used as a match all */
2590 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2591 		net_eq(sock_net(sk), seq_file_net(seq)));
2592 }
2593 
2594 /* Find a non empty bucket (starting from st->bucket)
2595  * and return the first sk from it.
2596  */
2597 static void *listening_get_first(struct seq_file *seq)
2598 {
2599 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2600 	struct tcp_iter_state *st = seq->private;
2601 
2602 	st->offset = 0;
2603 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2604 		struct inet_listen_hashbucket *ilb2;
2605 		struct hlist_nulls_node *node;
2606 		struct sock *sk;
2607 
2608 		ilb2 = &hinfo->lhash2[st->bucket];
2609 		if (hlist_nulls_empty(&ilb2->nulls_head))
2610 			continue;
2611 
2612 		spin_lock(&ilb2->lock);
2613 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2614 			if (seq_sk_match(seq, sk))
2615 				return sk;
2616 		}
2617 		spin_unlock(&ilb2->lock);
2618 	}
2619 
2620 	return NULL;
2621 }
2622 
2623 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2624  * If "cur" is the last one in the st->bucket,
2625  * call listening_get_first() to return the first sk of the next
2626  * non empty bucket.
2627  */
2628 static void *listening_get_next(struct seq_file *seq, void *cur)
2629 {
2630 	struct tcp_iter_state *st = seq->private;
2631 	struct inet_listen_hashbucket *ilb2;
2632 	struct hlist_nulls_node *node;
2633 	struct inet_hashinfo *hinfo;
2634 	struct sock *sk = cur;
2635 
2636 	++st->num;
2637 	++st->offset;
2638 
2639 	sk = sk_nulls_next(sk);
2640 	sk_nulls_for_each_from(sk, node) {
2641 		if (seq_sk_match(seq, sk))
2642 			return sk;
2643 	}
2644 
2645 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2646 	ilb2 = &hinfo->lhash2[st->bucket];
2647 	spin_unlock(&ilb2->lock);
2648 	++st->bucket;
2649 	return listening_get_first(seq);
2650 }
2651 
2652 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2653 {
2654 	struct tcp_iter_state *st = seq->private;
2655 	void *rc;
2656 
2657 	st->bucket = 0;
2658 	st->offset = 0;
2659 	rc = listening_get_first(seq);
2660 
2661 	while (rc && *pos) {
2662 		rc = listening_get_next(seq, rc);
2663 		--*pos;
2664 	}
2665 	return rc;
2666 }
2667 
2668 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2669 				const struct tcp_iter_state *st)
2670 {
2671 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2672 }
2673 
2674 /*
2675  * Get first established socket starting from bucket given in st->bucket.
2676  * If st->bucket is zero, the very first socket in the hash is returned.
2677  */
2678 static void *established_get_first(struct seq_file *seq)
2679 {
2680 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2681 	struct tcp_iter_state *st = seq->private;
2682 
2683 	st->offset = 0;
2684 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2685 		struct sock *sk;
2686 		struct hlist_nulls_node *node;
2687 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2688 
2689 		cond_resched();
2690 
2691 		/* Lockless fast path for the common case of empty buckets */
2692 		if (empty_bucket(hinfo, st))
2693 			continue;
2694 
2695 		spin_lock_bh(lock);
2696 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2697 			if (seq_sk_match(seq, sk))
2698 				return sk;
2699 		}
2700 		spin_unlock_bh(lock);
2701 	}
2702 
2703 	return NULL;
2704 }
2705 
2706 static void *established_get_next(struct seq_file *seq, void *cur)
2707 {
2708 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2709 	struct tcp_iter_state *st = seq->private;
2710 	struct hlist_nulls_node *node;
2711 	struct sock *sk = cur;
2712 
2713 	++st->num;
2714 	++st->offset;
2715 
2716 	sk = sk_nulls_next(sk);
2717 
2718 	sk_nulls_for_each_from(sk, node) {
2719 		if (seq_sk_match(seq, sk))
2720 			return sk;
2721 	}
2722 
2723 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2724 	++st->bucket;
2725 	return established_get_first(seq);
2726 }
2727 
2728 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2729 {
2730 	struct tcp_iter_state *st = seq->private;
2731 	void *rc;
2732 
2733 	st->bucket = 0;
2734 	rc = established_get_first(seq);
2735 
2736 	while (rc && pos) {
2737 		rc = established_get_next(seq, rc);
2738 		--pos;
2739 	}
2740 	return rc;
2741 }
2742 
2743 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2744 {
2745 	void *rc;
2746 	struct tcp_iter_state *st = seq->private;
2747 
2748 	st->state = TCP_SEQ_STATE_LISTENING;
2749 	rc	  = listening_get_idx(seq, &pos);
2750 
2751 	if (!rc) {
2752 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2753 		rc	  = established_get_idx(seq, pos);
2754 	}
2755 
2756 	return rc;
2757 }
2758 
2759 static void *tcp_seek_last_pos(struct seq_file *seq)
2760 {
2761 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2762 	struct tcp_iter_state *st = seq->private;
2763 	int bucket = st->bucket;
2764 	int offset = st->offset;
2765 	int orig_num = st->num;
2766 	void *rc = NULL;
2767 
2768 	switch (st->state) {
2769 	case TCP_SEQ_STATE_LISTENING:
2770 		if (st->bucket > hinfo->lhash2_mask)
2771 			break;
2772 		rc = listening_get_first(seq);
2773 		while (offset-- && rc && bucket == st->bucket)
2774 			rc = listening_get_next(seq, rc);
2775 		if (rc)
2776 			break;
2777 		st->bucket = 0;
2778 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2779 		fallthrough;
2780 	case TCP_SEQ_STATE_ESTABLISHED:
2781 		if (st->bucket > hinfo->ehash_mask)
2782 			break;
2783 		rc = established_get_first(seq);
2784 		while (offset-- && rc && bucket == st->bucket)
2785 			rc = established_get_next(seq, rc);
2786 	}
2787 
2788 	st->num = orig_num;
2789 
2790 	return rc;
2791 }
2792 
2793 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2794 {
2795 	struct tcp_iter_state *st = seq->private;
2796 	void *rc;
2797 
2798 	if (*pos && *pos == st->last_pos) {
2799 		rc = tcp_seek_last_pos(seq);
2800 		if (rc)
2801 			goto out;
2802 	}
2803 
2804 	st->state = TCP_SEQ_STATE_LISTENING;
2805 	st->num = 0;
2806 	st->bucket = 0;
2807 	st->offset = 0;
2808 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2809 
2810 out:
2811 	st->last_pos = *pos;
2812 	return rc;
2813 }
2814 EXPORT_SYMBOL(tcp_seq_start);
2815 
2816 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2817 {
2818 	struct tcp_iter_state *st = seq->private;
2819 	void *rc = NULL;
2820 
2821 	if (v == SEQ_START_TOKEN) {
2822 		rc = tcp_get_idx(seq, 0);
2823 		goto out;
2824 	}
2825 
2826 	switch (st->state) {
2827 	case TCP_SEQ_STATE_LISTENING:
2828 		rc = listening_get_next(seq, v);
2829 		if (!rc) {
2830 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2831 			st->bucket = 0;
2832 			st->offset = 0;
2833 			rc	  = established_get_first(seq);
2834 		}
2835 		break;
2836 	case TCP_SEQ_STATE_ESTABLISHED:
2837 		rc = established_get_next(seq, v);
2838 		break;
2839 	}
2840 out:
2841 	++*pos;
2842 	st->last_pos = *pos;
2843 	return rc;
2844 }
2845 EXPORT_SYMBOL(tcp_seq_next);
2846 
2847 void tcp_seq_stop(struct seq_file *seq, void *v)
2848 {
2849 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2850 	struct tcp_iter_state *st = seq->private;
2851 
2852 	switch (st->state) {
2853 	case TCP_SEQ_STATE_LISTENING:
2854 		if (v != SEQ_START_TOKEN)
2855 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2856 		break;
2857 	case TCP_SEQ_STATE_ESTABLISHED:
2858 		if (v)
2859 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2860 		break;
2861 	}
2862 }
2863 EXPORT_SYMBOL(tcp_seq_stop);
2864 
2865 static void get_openreq4(const struct request_sock *req,
2866 			 struct seq_file *f, int i)
2867 {
2868 	const struct inet_request_sock *ireq = inet_rsk(req);
2869 	long delta = req->rsk_timer.expires - jiffies;
2870 
2871 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2872 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2873 		i,
2874 		ireq->ir_loc_addr,
2875 		ireq->ir_num,
2876 		ireq->ir_rmt_addr,
2877 		ntohs(ireq->ir_rmt_port),
2878 		TCP_SYN_RECV,
2879 		0, 0, /* could print option size, but that is af dependent. */
2880 		1,    /* timers active (only the expire timer) */
2881 		jiffies_delta_to_clock_t(delta),
2882 		req->num_timeout,
2883 		from_kuid_munged(seq_user_ns(f),
2884 				 sock_i_uid(req->rsk_listener)),
2885 		0,  /* non standard timer */
2886 		0, /* open_requests have no inode */
2887 		0,
2888 		req);
2889 }
2890 
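/* Emit one /proc/net/tcp row for a full socket, matching the header printed
 * by tcp4_seq_show().  The "tr" (timer_active) column encodes which timer is
 * pending, as derived below: 1 retransmit/loss probe, 2 keepalive (sk_timer),
 * 4 zero window probe, 0 none.
 */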
2891 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2892 {
2893 	int timer_active;
2894 	unsigned long timer_expires;
2895 	const struct tcp_sock *tp = tcp_sk(sk);
2896 	const struct inet_connection_sock *icsk = inet_csk(sk);
2897 	const struct inet_sock *inet = inet_sk(sk);
2898 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2899 	__be32 dest = inet->inet_daddr;
2900 	__be32 src = inet->inet_rcv_saddr;
2901 	__u16 destp = ntohs(inet->inet_dport);
2902 	__u16 srcp = ntohs(inet->inet_sport);
2903 	int rx_queue;
2904 	int state;
2905 
2906 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2907 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2908 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2909 		timer_active	= 1;
2910 		timer_expires	= icsk->icsk_timeout;
2911 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2912 		timer_active	= 4;
2913 		timer_expires	= icsk->icsk_timeout;
2914 	} else if (timer_pending(&sk->sk_timer)) {
2915 		timer_active	= 2;
2916 		timer_expires	= sk->sk_timer.expires;
2917 	} else {
2918 		timer_active	= 0;
2919 		timer_expires = jiffies;
2920 	}
2921 
2922 	state = inet_sk_state_load(sk);
2923 	if (state == TCP_LISTEN)
2924 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2925 	else
2926 		/* Because we don't lock the socket,
2927 		 * we might find a transient negative value.
2928 		 */
2929 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2930 				      READ_ONCE(tp->copied_seq), 0);
2931 
2932 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2933 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2934 		i, src, srcp, dest, destp, state,
2935 		READ_ONCE(tp->write_seq) - tp->snd_una,
2936 		rx_queue,
2937 		timer_active,
2938 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2939 		icsk->icsk_retransmits,
2940 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2941 		icsk->icsk_probes_out,
2942 		sock_i_ino(sk),
2943 		refcount_read(&sk->sk_refcnt), sk,
2944 		jiffies_to_clock_t(icsk->icsk_rto),
2945 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2946 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2947 		tcp_snd_cwnd(tp),
2948 		state == TCP_LISTEN ?
2949 		    fastopenq->max_qlen :
2950 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2951 }
2952 
2953 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2954 			       struct seq_file *f, int i)
2955 {
2956 	long delta = tw->tw_timer.expires - jiffies;
2957 	__be32 dest, src;
2958 	__u16 destp, srcp;
2959 
2960 	dest  = tw->tw_daddr;
2961 	src   = tw->tw_rcv_saddr;
2962 	destp = ntohs(tw->tw_dport);
2963 	srcp  = ntohs(tw->tw_sport);
2964 
2965 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2966 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2967 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2968 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2969 		refcount_read(&tw->tw_refcnt), tw);
2970 }
2971 
2972 #define TMPSZ 150
2973 
2974 static int tcp4_seq_show(struct seq_file *seq, void *v)
2975 {
2976 	struct tcp_iter_state *st;
2977 	struct sock *sk = v;
2978 
2979 	seq_setwidth(seq, TMPSZ - 1);
2980 	if (v == SEQ_START_TOKEN) {
2981 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2982 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2983 			   "inode");
2984 		goto out;
2985 	}
2986 	st = seq->private;
2987 
2988 	if (sk->sk_state == TCP_TIME_WAIT)
2989 		get_timewait4_sock(v, seq, st->num);
2990 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2991 		get_openreq4(v, seq, st->num);
2992 	else
2993 		get_tcp4_sock(v, seq, st->num);
2994 out:
2995 	seq_pad(seq, '\n');
2996 	return 0;
2997 }
2998 
2999 #ifdef CONFIG_BPF_SYSCALL
3000 struct bpf_tcp_iter_state {
3001 	struct tcp_iter_state state;
3002 	unsigned int cur_sk;
3003 	unsigned int end_sk;
3004 	unsigned int max_sk;
3005 	struct sock **batch;
3006 	bool st_bucket_done;
3007 };
3008 
3009 struct bpf_iter__tcp {
3010 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3011 	__bpf_md_ptr(struct sock_common *, sk_common);
3012 	uid_t uid __aligned(8);
3013 };
3014 
3015 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3016 			     struct sock_common *sk_common, uid_t uid)
3017 {
3018 	struct bpf_iter__tcp ctx;
3019 
3020 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3021 	ctx.meta = meta;
3022 	ctx.sk_common = sk_common;
3023 	ctx.uid = uid;
3024 	return bpf_iter_run_prog(prog, &ctx);
3025 }
3026 
3027 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3028 {
3029 	while (iter->cur_sk < iter->end_sk)
3030 		sock_gen_put(iter->batch[iter->cur_sk++]);
3031 }
3032 
3033 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3034 				      unsigned int new_batch_sz)
3035 {
3036 	struct sock **new_batch;
3037 
3038 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3039 			     GFP_USER | __GFP_NOWARN);
3040 	if (!new_batch)
3041 		return -ENOMEM;
3042 
3043 	bpf_iter_tcp_put_batch(iter);
3044 	kvfree(iter->batch);
3045 	iter->batch = new_batch;
3046 	iter->max_sk = new_batch_sz;
3047 
3048 	return 0;
3049 }
3050 
3051 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3052 						 struct sock *start_sk)
3053 {
3054 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3055 	struct bpf_tcp_iter_state *iter = seq->private;
3056 	struct tcp_iter_state *st = &iter->state;
3057 	struct hlist_nulls_node *node;
3058 	unsigned int expected = 1;
3059 	struct sock *sk;
3060 
3061 	sock_hold(start_sk);
3062 	iter->batch[iter->end_sk++] = start_sk;
3063 
3064 	sk = sk_nulls_next(start_sk);
3065 	sk_nulls_for_each_from(sk, node) {
3066 		if (seq_sk_match(seq, sk)) {
3067 			if (iter->end_sk < iter->max_sk) {
3068 				sock_hold(sk);
3069 				iter->batch[iter->end_sk++] = sk;
3070 			}
3071 			expected++;
3072 		}
3073 	}
3074 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3075 
3076 	return expected;
3077 }
3078 
3079 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3080 						   struct sock *start_sk)
3081 {
3082 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3083 	struct bpf_tcp_iter_state *iter = seq->private;
3084 	struct tcp_iter_state *st = &iter->state;
3085 	struct hlist_nulls_node *node;
3086 	unsigned int expected = 1;
3087 	struct sock *sk;
3088 
3089 	sock_hold(start_sk);
3090 	iter->batch[iter->end_sk++] = start_sk;
3091 
3092 	sk = sk_nulls_next(start_sk);
3093 	sk_nulls_for_each_from(sk, node) {
3094 		if (seq_sk_match(seq, sk)) {
3095 			if (iter->end_sk < iter->max_sk) {
3096 				sock_hold(sk);
3097 				iter->batch[iter->end_sk++] = sk;
3098 			}
3099 			expected++;
3100 		}
3101 	}
3102 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3103 
3104 	return expected;
3105 }
3106 
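/* Grab the next batch of sockets for the bpf tcp iterator.  All sockets of
 * the bucket that tcp_seek_last_pos() lands on are reference-counted into
 * iter->batch; if the preallocated array turns out to be too small, it is
 * grown to 3/2 of the observed bucket size and the bucket is walked again.
 */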
3107 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3108 {
3109 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3110 	struct bpf_tcp_iter_state *iter = seq->private;
3111 	struct tcp_iter_state *st = &iter->state;
3112 	unsigned int expected;
3113 	bool resized = false;
3114 	struct sock *sk;
3115 
3116 	/* The st->bucket is done.  Directly advance to the next
3117 	 * bucket instead of having tcp_seek_last_pos() skip entries
3118 	 * one by one in the current bucket, only to find out
3119 	 * it has to advance to the next bucket.
3120 	 */
3121 	if (iter->st_bucket_done) {
3122 		st->offset = 0;
3123 		st->bucket++;
3124 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3125 		    st->bucket > hinfo->lhash2_mask) {
3126 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3127 			st->bucket = 0;
3128 		}
3129 	}
3130 
3131 again:
3132 	/* Get a new batch */
3133 	iter->cur_sk = 0;
3134 	iter->end_sk = 0;
3135 	iter->st_bucket_done = false;
3136 
3137 	sk = tcp_seek_last_pos(seq);
3138 	if (!sk)
3139 		return NULL; /* Done */
3140 
3141 	if (st->state == TCP_SEQ_STATE_LISTENING)
3142 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3143 	else
3144 		expected = bpf_iter_tcp_established_batch(seq, sk);
3145 
3146 	if (iter->end_sk == expected) {
3147 		iter->st_bucket_done = true;
3148 		return sk;
3149 	}
3150 
3151 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3152 		resized = true;
3153 		goto again;
3154 	}
3155 
3156 	return sk;
3157 }
3158 
3159 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3160 {
3161 	/* bpf iter does not support lseek, so it always
3162 	 * continues from where it was stop()-ped.
3163 	 */
3164 	if (*pos)
3165 		return bpf_iter_tcp_batch(seq);
3166 
3167 	return SEQ_START_TOKEN;
3168 }
3169 
3170 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3171 {
3172 	struct bpf_tcp_iter_state *iter = seq->private;
3173 	struct tcp_iter_state *st = &iter->state;
3174 	struct sock *sk;
3175 
3176 	/* Whenever seq_next() is called, seq_show() is done with the
3177 	 * sk at iter->cur_sk, so advance to the next sk in
3178 	 * the batch.
3179 	 */
3180 	if (iter->cur_sk < iter->end_sk) {
3181 		/* Keeping st->num consistent in tcp_iter_state.
3182 		 * bpf_iter_tcp does not use st->num.
3183 		 * meta.seq_num is used instead.
3184 		 */
3185 		st->num++;
3186 		/* Move st->offset to the next sk in the bucket such that
3187 		 * the future start() will resume at st->offset in
3188 		 * st->bucket.  See tcp_seek_last_pos().
3189 		 */
3190 		st->offset++;
3191 		sock_gen_put(iter->batch[iter->cur_sk++]);
3192 	}
3193 
3194 	if (iter->cur_sk < iter->end_sk)
3195 		sk = iter->batch[iter->cur_sk];
3196 	else
3197 		sk = bpf_iter_tcp_batch(seq);
3198 
3199 	++*pos;
3200 	/* Keeping st->last_pos consistent in tcp_iter_state.
3201 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3202 	 */
3203 	st->last_pos = *pos;
3204 	return sk;
3205 }
3206 
3207 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3208 {
3209 	struct bpf_iter_meta meta;
3210 	struct bpf_prog *prog;
3211 	struct sock *sk = v;
3212 	uid_t uid;
3213 	int ret;
3214 
3215 	if (v == SEQ_START_TOKEN)
3216 		return 0;
3217 
3218 	if (sk_fullsock(sk))
3219 		lock_sock(sk);
3220 
3221 	if (unlikely(sk_unhashed(sk))) {
3222 		ret = SEQ_SKIP;
3223 		goto unlock;
3224 	}
3225 
3226 	if (sk->sk_state == TCP_TIME_WAIT) {
3227 		uid = 0;
3228 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3229 		const struct request_sock *req = v;
3230 
3231 		uid = from_kuid_munged(seq_user_ns(seq),
3232 				       sock_i_uid(req->rsk_listener));
3233 	} else {
3234 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3235 	}
3236 
3237 	meta.seq = seq;
3238 	prog = bpf_iter_get_info(&meta, false);
3239 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3240 
3241 unlock:
3242 	if (sk_fullsock(sk))
3243 		release_sock(sk);
3244 	return ret;
3245 
3246 }
3247 
3248 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3249 {
3250 	struct bpf_tcp_iter_state *iter = seq->private;
3251 	struct bpf_iter_meta meta;
3252 	struct bpf_prog *prog;
3253 
3254 	if (!v) {
3255 		meta.seq = seq;
3256 		prog = bpf_iter_get_info(&meta, true);
3257 		if (prog)
3258 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3259 	}
3260 
3261 	if (iter->cur_sk < iter->end_sk) {
3262 		bpf_iter_tcp_put_batch(iter);
3263 		iter->st_bucket_done = false;
3264 	}
3265 }
3266 
3267 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3268 	.show		= bpf_iter_tcp_seq_show,
3269 	.start		= bpf_iter_tcp_seq_start,
3270 	.next		= bpf_iter_tcp_seq_next,
3271 	.stop		= bpf_iter_tcp_seq_stop,
3272 };
3273 #endif
3274 static unsigned short seq_file_family(const struct seq_file *seq)
3275 {
3276 	const struct tcp_seq_afinfo *afinfo;
3277 
3278 #ifdef CONFIG_BPF_SYSCALL
3279 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3280 	if (seq->op == &bpf_iter_tcp_seq_ops)
3281 		return AF_UNSPEC;
3282 #endif
3283 
3284 	/* Iterated from proc fs */
3285 	afinfo = pde_data(file_inode(seq->file));
3286 	return afinfo->family;
3287 }
3288 
3289 static const struct seq_operations tcp4_seq_ops = {
3290 	.show		= tcp4_seq_show,
3291 	.start		= tcp_seq_start,
3292 	.next		= tcp_seq_next,
3293 	.stop		= tcp_seq_stop,
3294 };
3295 
3296 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3297 	.family		= AF_INET,
3298 };
3299 
3300 static int __net_init tcp4_proc_init_net(struct net *net)
3301 {
3302 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3303 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3304 		return -ENOMEM;
3305 	return 0;
3306 }
3307 
3308 static void __net_exit tcp4_proc_exit_net(struct net *net)
3309 {
3310 	remove_proc_entry("tcp", net->proc_net);
3311 }
3312 
3313 static struct pernet_operations tcp4_net_ops = {
3314 	.init = tcp4_proc_init_net,
3315 	.exit = tcp4_proc_exit_net,
3316 };
3317 
3318 int __init tcp4_proc_init(void)
3319 {
3320 	return register_pernet_subsys(&tcp4_net_ops);
3321 }
3322 
3323 void tcp4_proc_exit(void)
3324 {
3325 	unregister_pernet_subsys(&tcp4_net_ops);
3326 }
3327 #endif /* CONFIG_PROC_FS */
3328 
3329 /* @wake is one when sk_stream_write_space() calls us.
3330  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3331  * This mimics the strategy used in sock_def_write_space().
3332  */
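/* Example (hypothetical lowat): with tcp_notsent_lowat == 131072 and
 * @wake == 1, the test below becomes (notsent_bytes << 1) < 131072, i.e.
 * EPOLLOUT is signalled only once fewer than 65536 not-yet-sent bytes remain.
 */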
3333 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3334 {
3335 	const struct tcp_sock *tp = tcp_sk(sk);
3336 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3337 			    READ_ONCE(tp->snd_nxt);
3338 
3339 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3340 }
3341 EXPORT_SYMBOL(tcp_stream_memory_free);
3342 
3343 struct proto tcp_prot = {
3344 	.name			= "TCP",
3345 	.owner			= THIS_MODULE,
3346 	.close			= tcp_close,
3347 	.pre_connect		= tcp_v4_pre_connect,
3348 	.connect		= tcp_v4_connect,
3349 	.disconnect		= tcp_disconnect,
3350 	.accept			= inet_csk_accept,
3351 	.ioctl			= tcp_ioctl,
3352 	.init			= tcp_v4_init_sock,
3353 	.destroy		= tcp_v4_destroy_sock,
3354 	.shutdown		= tcp_shutdown,
3355 	.setsockopt		= tcp_setsockopt,
3356 	.getsockopt		= tcp_getsockopt,
3357 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3358 	.keepalive		= tcp_set_keepalive,
3359 	.recvmsg		= tcp_recvmsg,
3360 	.sendmsg		= tcp_sendmsg,
3361 	.splice_eof		= tcp_splice_eof,
3362 	.backlog_rcv		= tcp_v4_do_rcv,
3363 	.release_cb		= tcp_release_cb,
3364 	.hash			= inet_hash,
3365 	.unhash			= inet_unhash,
3366 	.get_port		= inet_csk_get_port,
3367 	.put_port		= inet_put_port,
3368 #ifdef CONFIG_BPF_SYSCALL
3369 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3370 #endif
3371 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3372 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3373 	.stream_memory_free	= tcp_stream_memory_free,
3374 	.sockets_allocated	= &tcp_sockets_allocated,
3375 	.orphan_count		= &tcp_orphan_count,
3376 
3377 	.memory_allocated	= &tcp_memory_allocated,
3378 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3379 
3380 	.memory_pressure	= &tcp_memory_pressure,
3381 	.sysctl_mem		= sysctl_tcp_mem,
3382 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3383 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3384 	.max_header		= MAX_TCP_HEADER,
3385 	.obj_size		= sizeof(struct tcp_sock),
3386 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3387 	.twsk_prot		= &tcp_timewait_sock_ops,
3388 	.rsk_prot		= &tcp_request_sock_ops,
3389 	.h.hashinfo		= NULL,
3390 	.no_autobind		= true,
3391 	.diag_destroy		= tcp_abort,
3392 };
3393 EXPORT_SYMBOL(tcp_prot);
3394 
3395 static void __net_exit tcp_sk_exit(struct net *net)
3396 {
3397 	if (net->ipv4.tcp_congestion_control)
3398 		bpf_module_put(net->ipv4.tcp_congestion_control,
3399 			       net->ipv4.tcp_congestion_control->owner);
3400 }
3401 
3402 static void __net_init tcp_set_hashinfo(struct net *net)
3403 {
3404 	struct inet_hashinfo *hinfo;
3405 	unsigned int ehash_entries;
3406 	struct net *old_net;
3407 
3408 	if (net_eq(net, &init_net))
3409 		goto fallback;
3410 
3411 	old_net = current->nsproxy->net_ns;
3412 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3413 	if (!ehash_entries)
3414 		goto fallback;
3415 
3416 	ehash_entries = roundup_pow_of_two(ehash_entries);
3417 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3418 	if (!hinfo) {
3419 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3420 			"for a netns, fallback to the global one\n",
3421 			ehash_entries);
3422 fallback:
3423 		hinfo = &tcp_hashinfo;
3424 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3425 	}
3426 
3427 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3428 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3429 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
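	/* Illustration (hypothetical sysctl value): tcp_child_ehash_entries
	 * set to 1000 is rounded up to 1024 slots above, which yields
	 * sysctl_max_tw_buckets = 512 and sysctl_max_syn_backlog =
	 * max(128, 8) = 128.
	 */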
3430 }
3431 
3432 static int __net_init tcp_sk_init(struct net *net)
3433 {
3434 	net->ipv4.sysctl_tcp_ecn = 2;
3435 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3436 
3437 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3438 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3439 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3440 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3441 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3442 
3443 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3444 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3445 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3446 
3447 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3448 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3449 	net->ipv4.sysctl_tcp_syncookies = 1;
3450 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3451 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3452 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3453 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3454 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3455 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3456 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3457 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3458 
3459 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3460 	tcp_set_hashinfo(net);
3461 
3462 	net->ipv4.sysctl_tcp_sack = 1;
3463 	net->ipv4.sysctl_tcp_window_scaling = 1;
3464 	net->ipv4.sysctl_tcp_timestamps = 1;
3465 	net->ipv4.sysctl_tcp_early_retrans = 3;
3466 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3467 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3468 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3469 	net->ipv4.sysctl_tcp_max_reordering = 300;
3470 	net->ipv4.sysctl_tcp_dsack = 1;
3471 	net->ipv4.sysctl_tcp_app_win = 31;
3472 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3473 	net->ipv4.sysctl_tcp_frto = 2;
3474 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3475 	/* This limits the percentage of the congestion window which we
3476 	 * will allow a single TSO frame to consume.  Building TSO frames
3477 	 * which are too large can cause TCP streams to be bursty.
3478 	 */
3479 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3480 	/* Default TSQ limit of 16 TSO segments */
3481 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3482 
3483 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3484 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3485 
3486 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3487 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3488 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3489 	net->ipv4.sysctl_tcp_autocorking = 1;
3490 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3491 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3492 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3493 	if (net != &init_net) {
3494 		memcpy(net->ipv4.sysctl_tcp_rmem,
3495 		       init_net.ipv4.sysctl_tcp_rmem,
3496 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3497 		memcpy(net->ipv4.sysctl_tcp_wmem,
3498 		       init_net.ipv4.sysctl_tcp_wmem,
3499 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3500 	}
3501 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3502 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3503 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3504 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3505 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3506 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3507 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3508 
3509 	/* Set default values for PLB */
3510 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3511 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3512 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3513 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3514 	/* Default congestion threshold for PLB to mark a round is 50% */
3515 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3516 
3517 	/* Reno is always built in */
3518 	if (!net_eq(net, &init_net) &&
3519 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3520 			       init_net.ipv4.tcp_congestion_control->owner))
3521 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3522 	else
3523 		net->ipv4.tcp_congestion_control = &tcp_reno;
3524 
3525 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3526 	net->ipv4.sysctl_tcp_shrink_window = 0;
3527 
3528 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3529 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3530 
3531 	return 0;
3532 }
3533 
3534 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3535 {
3536 	struct net *net;
3537 
3538 	/* Make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3539 	 * and the failed setup_net error unwinding path are serialized.
3540 	 *
3541 	 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those
3542 	 * in net_exit_list, the thread that dismantles a particular twsk must
3543 	 * do so without another thread progressing to refcount_dec_and_test() of
3544 	 * tcp_death_row.tw_refcount.
3545 	 */
3546 	mutex_lock(&tcp_exit_batch_mutex);
3547 
3548 	tcp_twsk_purge(net_exit_list);
3549 
3550 	list_for_each_entry(net, net_exit_list, exit_list) {
3551 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3552 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3553 		tcp_fastopen_ctx_destroy(net);
3554 	}
3555 
3556 	mutex_unlock(&tcp_exit_batch_mutex);
3557 }
3558 
3559 static struct pernet_operations __net_initdata tcp_sk_ops = {
3560 	.init	   = tcp_sk_init,
3561 	.exit	   = tcp_sk_exit,
3562 	.exit_batch = tcp_sk_exit_batch,
3563 };
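/* tcp_sk_ops follows the generic pernet_operations pattern: .init runs for
 * every new namespace, .exit for each namespace being torn down, and
 * .exit_batch once per batch of namespaces on cleanup.  A minimal sketch of
 * the same pattern, with purely hypothetical names:
 *
 *	static int __net_init example_net_init(struct net *net)
 *	{
 *		return 0;
 *	}
 *
 *	static void __net_exit example_net_exit(struct net *net)
 *	{
 *	}
 *
 *	static struct pernet_operations example_net_ops = {
 *		.init = example_net_init,
 *		.exit = example_net_exit,
 *	};
 *
 * register_pernet_subsys(&example_net_ops) would then be called from an
 * __init function, as tcp_v4_init() does for tcp_sk_ops below.
 */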
3564 
3565 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3566 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3567 		     struct sock_common *sk_common, uid_t uid)
3568 
3569 #define INIT_BATCH_SZ 16
3570 
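/* Pre-allocate a batch of INIT_BATCH_SZ socket pointers; the iterator fills
 * it from a hash bucket under the bucket lock, drops the lock, and only then
 * hands each socket to the BPF program.  The batch is grown on demand when a
 * bucket holds more sockets.
 */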
3571 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3572 {
3573 	struct bpf_tcp_iter_state *iter = priv_data;
3574 	int err;
3575 
3576 	err = bpf_iter_init_seq_net(priv_data, aux);
3577 	if (err)
3578 		return err;
3579 
3580 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3581 	if (err) {
3582 		bpf_iter_fini_seq_net(priv_data);
3583 		return err;
3584 	}
3585 
3586 	return 0;
3587 }
3588 
3589 static void bpf_iter_fini_tcp(void *priv_data)
3590 {
3591 	struct bpf_tcp_iter_state *iter = priv_data;
3592 
3593 	bpf_iter_fini_seq_net(priv_data);
3594 	kvfree(iter->batch);
3595 }
3596 
3597 static const struct bpf_iter_seq_info tcp_seq_info = {
3598 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3599 	.init_seq_private	= bpf_iter_init_tcp,
3600 	.fini_seq_private	= bpf_iter_fini_tcp,
3601 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3602 };
3603 
3604 static const struct bpf_func_proto *
3605 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3606 			    const struct bpf_prog *prog)
3607 {
3608 	switch (func_id) {
3609 	case BPF_FUNC_setsockopt:
3610 		return &bpf_sk_setsockopt_proto;
3611 	case BPF_FUNC_getsockopt:
3612 		return &bpf_sk_getsockopt_proto;
3613 	default:
3614 		return NULL;
3615 	}
3616 }
3617 
3618 static struct bpf_iter_reg tcp_reg_info = {
3619 	.target			= "tcp",
3620 	.ctx_arg_info_size	= 1,
3621 	.ctx_arg_info		= {
3622 		{ offsetof(struct bpf_iter__tcp, sk_common),
3623 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3624 	},
3625 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3626 	.seq_info		= &tcp_seq_info,
3627 };
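/* A BPF program attaches to this "tcp" target via the SEC("iter/tcp")
 * section name and receives the context declared by DEFINE_BPF_ITER_FUNC()
 * above.  A minimal sketch, built separately against libbpf (program name
 * and output format are illustrative only):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *
 *		if (!sk_common)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family %u\n",
 *			       sk_common->skc_family);
 *		return 0;
 *	}
 *
 * The setsockopt/getsockopt helpers exposed by bpf_iter_tcp_get_func_proto()
 * above additionally let such a program adjust socket options while
 * iterating.
 */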
3628 
3629 static void __init bpf_iter_register(void)
3630 {
3631 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3632 	if (bpf_iter_reg_target(&tcp_reg_info))
3633 		pr_warn("Warning: could not register bpf iterator tcp\n");
3634 }
3635 
3636 #endif
3637 
3638 void __init tcp_v4_init(void)
3639 {
3640 	int cpu, res;
3641 
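	/* One control socket per possible CPU: tcp_v4_send_reset() and
	 * tcp_v4_send_ack() use the local CPU's socket to emit RSTs and
	 * ACKs (e.g. for SYN-RECV and TIME-WAIT) without a full socket
	 * context.
	 */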
3642 	for_each_possible_cpu(cpu) {
3643 		struct sock *sk;
3644 
3645 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3646 					   IPPROTO_TCP, &init_net);
3647 		if (res)
3648 			panic("Failed to create the TCP control socket.\n");
3649 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3650 
3651 		/* Enforce IP_DF and IPID==0 for RSTs, and for ACKs sent in
3652 		 * SYN-RECV and TIME-WAIT state.
3653 		 */
3654 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3655 
3656 		sk->sk_clockid = CLOCK_MONOTONIC;
3657 
3658 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3659 	}
3660 	if (register_pernet_subsys(&tcp_sk_ops))
3661 		panic("Failed to register the TCP pernet operations.\n");
3662 
3663 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3664 	bpf_iter_register();
3665 #endif
3666 }
3667