xref: /linux/net/ipv4/tcp_ipv4.c (revision 96f30c8f0aa9923aa39b30bcaefeacf88b490231)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 
83 #include <crypto/hash.h>
84 #include <linux/scatterlist.h>
85 
86 #include <trace/events/tcp.h>
87 
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92 
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95 
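/* Per-cpu kernel control socket used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to transmit replies without a full socket context;
 * bh_lock serializes its users on the same CPU.
 */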
96 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
97 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
98 };
99 
100 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
101 {
102 	return secure_tcp_seq(ip_hdr(skb)->daddr,
103 			      ip_hdr(skb)->saddr,
104 			      tcp_hdr(skb)->dest,
105 			      tcp_hdr(skb)->source);
106 }
107 
108 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
109 {
110 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
111 }
112 
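/* Decide whether the TIME-WAIT socket occupying our 4-tuple may be reused
 * for a new outgoing connection.  Returns 1 (with a reference taken on the
 * timewait sock) if the caller may proceed, 0 otherwise.
 */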
113 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
114 {
115 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
116 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
117 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
118 	struct tcp_sock *tp = tcp_sk(sk);
119 	int ts_recent_stamp;
120 
121 	if (reuse == 2) {
122 		/* Still does not detect *everything* that goes through
123 		 * lo, since we require a loopback src or dst address
124 		 * or direct binding to the 'lo' interface.
125 		 */
126 		bool loopback = false;
127 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
128 			loopback = true;
129 #if IS_ENABLED(CONFIG_IPV6)
130 		if (tw->tw_family == AF_INET6) {
131 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
132 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
133 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
134 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
135 				loopback = true;
136 		} else
137 #endif
138 		{
139 			if (ipv4_is_loopback(tw->tw_daddr) ||
140 			    ipv4_is_loopback(tw->tw_rcv_saddr))
141 				loopback = true;
142 		}
143 		if (!loopback)
144 			reuse = 0;
145 	}
146 
147 	/* With PAWS, it is safe from the viewpoint
148 	   of data integrity. Even without PAWS it is safe provided sequence
149 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
150 
151 	   Actually, the idea is close to VJ's: only the timestamp cache is
152 	   held not per host but per port pair, and the TW bucket is used as
153 	   the state holder.
154 
155 	   If the TW bucket has already been destroyed we fall back to VJ's
156 	   scheme and use the initial timestamp retrieved from the peer table.
157 	 */
158 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
159 	if (ts_recent_stamp &&
160 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
161 					    ts_recent_stamp)))) {
162 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
163 		 * and releasing the bucket lock.
164 		 */
165 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
166 			return 0;
167 
168 		/* In case of repair and re-using TIME-WAIT sockets we still
169 		 * want to be sure that it is safe as above but honor the
170 		 * sequence numbers and time stamps set as part of the repair
171 		 * process.
172 		 *
173 		 * Without this check re-using a TIME-WAIT socket with TCP
174 		 * repair would accumulate a -1 on the repair assigned
175 		 * sequence number. The first time it is reused the sequence
176 		 * is -1, the second time -2, etc. This fixes that issue
177 		 * without appearing to create any others.
178 		 */
179 		if (likely(!tp->repair)) {
180 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
181 
182 			if (!seq)
183 				seq = 1;
184 			WRITE_ONCE(tp->write_seq, seq);
185 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
186 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
187 		}
188 
189 		return 1;
190 	}
191 
192 	return 0;
193 }
194 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
195 
196 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
197 			      int addr_len)
198 {
199 	/* This check is replicated from tcp_v4_connect() and intended to
200 	 * prevent the BPF program called below from accessing bytes that are
201 	 * out of the bounds specified by the user in addr_len.
202 	 */
203 	if (addr_len < sizeof(struct sockaddr_in))
204 		return -EINVAL;
205 
206 	sock_owned_by_me(sk);
207 
208 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
209 }
210 
211 /* This will initiate an outgoing connection. */
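/* Roughly: resolve a route to the destination, pick a source address if
 * needed, bind a local port via inet_hash_connect(), choose the initial
 * sequence number and timestamp offset, then hand off to tcp_connect()
 * to build and send the SYN.
 */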
212 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
213 {
214 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
215 	struct inet_timewait_death_row *tcp_death_row;
216 	struct inet_sock *inet = inet_sk(sk);
217 	struct tcp_sock *tp = tcp_sk(sk);
218 	struct ip_options_rcu *inet_opt;
219 	struct net *net = sock_net(sk);
220 	__be16 orig_sport, orig_dport;
221 	__be32 daddr, nexthop;
222 	struct flowi4 *fl4;
223 	struct rtable *rt;
224 	int err;
225 
226 	if (addr_len < sizeof(struct sockaddr_in))
227 		return -EINVAL;
228 
229 	if (usin->sin_family != AF_INET)
230 		return -EAFNOSUPPORT;
231 
232 	nexthop = daddr = usin->sin_addr.s_addr;
233 	inet_opt = rcu_dereference_protected(inet->inet_opt,
234 					     lockdep_sock_is_held(sk));
235 	if (inet_opt && inet_opt->opt.srr) {
236 		if (!daddr)
237 			return -EINVAL;
238 		nexthop = inet_opt->opt.faddr;
239 	}
240 
241 	orig_sport = inet->inet_sport;
242 	orig_dport = usin->sin_port;
243 	fl4 = &inet->cork.fl.u.ip4;
244 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
245 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
246 			      orig_dport, sk);
247 	if (IS_ERR(rt)) {
248 		err = PTR_ERR(rt);
249 		if (err == -ENETUNREACH)
250 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
251 		return err;
252 	}
253 
254 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
255 		ip_rt_put(rt);
256 		return -ENETUNREACH;
257 	}
258 
259 	if (!inet_opt || !inet_opt->opt.srr)
260 		daddr = fl4->daddr;
261 
262 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
263 
264 	if (!inet->inet_saddr) {
265 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
266 		if (err) {
267 			ip_rt_put(rt);
268 			return err;
269 		}
270 	} else {
271 		sk_rcv_saddr_set(sk, inet->inet_saddr);
272 	}
273 
274 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275 		/* Reset inherited state */
276 		tp->rx_opt.ts_recent	   = 0;
277 		tp->rx_opt.ts_recent_stamp = 0;
278 		if (likely(!tp->repair))
279 			WRITE_ONCE(tp->write_seq, 0);
280 	}
281 
282 	inet->inet_dport = usin->sin_port;
283 	sk_daddr_set(sk, daddr);
284 
285 	inet_csk(sk)->icsk_ext_hdr_len = 0;
286 	if (inet_opt)
287 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
288 
289 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
290 
291 	/* Socket identity is still unknown (sport may be zero).
292 	 * However we set state to SYN-SENT and, without releasing the socket
293 	 * lock, select a source port, enter ourselves into the hash tables and
294 	 * complete initialization after this.
295 	 */
296 	tcp_set_state(sk, TCP_SYN_SENT);
297 	err = inet_hash_connect(tcp_death_row, sk);
298 	if (err)
299 		goto failure;
300 
301 	sk_set_txhash(sk);
302 
303 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304 			       inet->inet_sport, inet->inet_dport, sk);
305 	if (IS_ERR(rt)) {
306 		err = PTR_ERR(rt);
307 		rt = NULL;
308 		goto failure;
309 	}
310 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
311 	/* OK, now commit destination to socket.  */
312 	sk->sk_gso_type = SKB_GSO_TCPV4;
313 	sk_setup_caps(sk, &rt->dst);
314 	rt = NULL;
315 
316 	if (likely(!tp->repair)) {
317 		if (!tp->write_seq)
318 			WRITE_ONCE(tp->write_seq,
319 				   secure_tcp_seq(inet->inet_saddr,
320 						  inet->inet_daddr,
321 						  inet->inet_sport,
322 						  usin->sin_port));
323 		WRITE_ONCE(tp->tsoffset,
324 			   secure_tcp_ts_off(net, inet->inet_saddr,
325 					     inet->inet_daddr));
326 	}
327 
328 	atomic_set(&inet->inet_id, get_random_u16());
329 
330 	if (tcp_fastopen_defer_connect(sk, &err))
331 		return err;
332 	if (err)
333 		goto failure;
334 
335 	err = tcp_connect(sk);
336 
337 	if (err)
338 		goto failure;
339 
340 	return 0;
341 
342 failure:
343 	/*
344 	 * This unhashes the socket and releases the local port,
345 	 * if necessary.
346 	 */
347 	tcp_set_state(sk, TCP_CLOSE);
348 	inet_bhash2_reset_saddr(sk);
349 	ip_rt_put(rt);
350 	sk->sk_route_caps = 0;
351 	inet->inet_dport = 0;
352 	return err;
353 }
354 EXPORT_SYMBOL(tcp_v4_connect);
355 
356 /*
357  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
358  * It can be called through tcp_release_cb() if the socket was owned by the
359  * user at the time tcp_v4_err() was called to handle the ICMP message.
360  */
361 void tcp_v4_mtu_reduced(struct sock *sk)
362 {
363 	struct inet_sock *inet = inet_sk(sk);
364 	struct dst_entry *dst;
365 	u32 mtu;
366 
367 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
368 		return;
369 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
370 	dst = inet_csk_update_pmtu(sk, mtu);
371 	if (!dst)
372 		return;
373 
374 	/* Something is about to go wrong... Remember the soft error
375 	 * in case this connection is not able to recover.
376 	 */
377 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
378 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
379 
380 	mtu = dst_mtu(dst);
381 
382 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
383 	    ip_sk_accept_pmtu(sk) &&
384 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
385 		tcp_sync_mss(sk, mtu);
386 
387 		/* Resend the TCP packet because it's
388 		 * clear that the old packet has been
389 		 * dropped. This is the new "fast" path mtu
390 		 * discovery.
391 		 */
392 		tcp_simple_retransmit(sk);
393 	} /* else let the usual retransmit timer handle it */
394 }
395 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
396 
397 static void do_redirect(struct sk_buff *skb, struct sock *sk)
398 {
399 	struct dst_entry *dst = __sk_dst_check(sk, 0);
400 
401 	if (dst)
402 		dst->ops->redirect(dst, sk, skb);
403 }
404 
406 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
407 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
408 {
409 	struct request_sock *req = inet_reqsk(sk);
410 	struct net *net = sock_net(sk);
411 
412 	/* ICMPs are not backlogged, hence we cannot get
413 	 * an established socket here.
414 	 */
415 	if (seq != tcp_rsk(req)->snt_isn) {
416 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
417 	} else if (abort) {
418 		/*
419 		 * Still in SYN_RECV, just remove it silently.
420 		 * There is no good way to pass the error to the newly
421 		 * created socket, and POSIX does not want network
422 		 * errors returned from accept().
423 		 */
424 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
425 		tcp_listendrop(req->rsk_listener);
426 	}
427 	reqsk_put(req);
428 }
429 EXPORT_SYMBOL(tcp_req_err);
430 
431 /* TCP-LD (RFC 6069) logic */
432 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
433 {
434 	struct inet_connection_sock *icsk = inet_csk(sk);
435 	struct tcp_sock *tp = tcp_sk(sk);
436 	struct sk_buff *skb;
437 	s32 remaining;
438 	u32 delta_us;
439 
440 	if (sock_owned_by_user(sk))
441 		return;
442 
443 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
444 	    !icsk->icsk_backoff)
445 		return;
446 
447 	skb = tcp_rtx_queue_head(sk);
448 	if (WARN_ON_ONCE(!skb))
449 		return;
450 
451 	icsk->icsk_backoff--;
452 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
453 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
454 
455 	tcp_mstamp_refresh(tp);
456 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
457 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
458 
459 	if (remaining > 0) {
460 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
461 					  remaining, TCP_RTO_MAX);
462 	} else {
463 		/* RTO revert clocked out retransmission.
464 		 * Will retransmit now.
465 		 */
466 		tcp_retransmit_timer(sk);
467 	}
468 }
469 EXPORT_SYMBOL(tcp_ld_RTO_revert);
470 
471 /*
472  * This routine is called by the ICMP module when it gets some
473  * sort of error condition.  If err < 0 then the socket should
474  * be closed and the error returned to the user.  If err > 0
475  * it's just the icmp type << 8 | icmp code.  After adjustment
476  * header points to the first 8 bytes of the tcp header.  We need
477  * to find the appropriate port.
478  *
479  * The locking strategy used here is very "optimistic". When
480  * someone else accesses the socket the ICMP is just dropped
481  * and for some paths there is no check at all.
482  * A more general error queue to queue errors for later handling
483  * is probably better.
484  *
485  */
486 
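/* Example of the encoding mentioned above: an ICMP_DEST_UNREACH (type 3)
 * carrying ICMP_PORT_UNREACH (code 3) corresponds to (3 << 8) | 3 = 0x0303,
 * which icmp_err_convert[] maps to ECONNREFUSED below.
 */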
487 int tcp_v4_err(struct sk_buff *skb, u32 info)
488 {
489 	const struct iphdr *iph = (const struct iphdr *)skb->data;
490 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
491 	struct tcp_sock *tp;
492 	const int type = icmp_hdr(skb)->type;
493 	const int code = icmp_hdr(skb)->code;
494 	struct sock *sk;
495 	struct request_sock *fastopen;
496 	u32 seq, snd_una;
497 	int err;
498 	struct net *net = dev_net(skb->dev);
499 
500 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
501 				       iph->daddr, th->dest, iph->saddr,
502 				       ntohs(th->source), inet_iif(skb), 0);
503 	if (!sk) {
504 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
505 		return -ENOENT;
506 	}
507 	if (sk->sk_state == TCP_TIME_WAIT) {
508 		/* To increase the counter of ignored icmps for TCP-AO */
509 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
510 		inet_twsk_put(inet_twsk(sk));
511 		return 0;
512 	}
513 	seq = ntohl(th->seq);
514 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
515 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
516 				     type == ICMP_TIME_EXCEEDED ||
517 				     (type == ICMP_DEST_UNREACH &&
518 				      (code == ICMP_NET_UNREACH ||
519 				       code == ICMP_HOST_UNREACH)));
520 		return 0;
521 	}
522 
523 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
524 		sock_put(sk);
525 		return 0;
526 	}
527 
528 	bh_lock_sock(sk);
529 	/* If too many ICMPs get dropped on busy
530 	 * servers this needs to be solved differently.
531 	 * We do take care of the PMTU discovery (RFC1191) special case:
532 	 * we can receive locally generated ICMP messages while the socket is held.
533 	 */
534 	if (sock_owned_by_user(sk)) {
535 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
536 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
537 	}
538 	if (sk->sk_state == TCP_CLOSE)
539 		goto out;
540 
541 	if (static_branch_unlikely(&ip4_min_ttl)) {
542 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
543 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
544 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
545 			goto out;
546 		}
547 	}
548 
549 	tp = tcp_sk(sk);
550 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
551 	fastopen = rcu_dereference(tp->fastopen_rsk);
552 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
553 	if (sk->sk_state != TCP_LISTEN &&
554 	    !between(seq, snd_una, tp->snd_nxt)) {
555 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
556 		goto out;
557 	}
558 
559 	switch (type) {
560 	case ICMP_REDIRECT:
561 		if (!sock_owned_by_user(sk))
562 			do_redirect(skb, sk);
563 		goto out;
564 	case ICMP_SOURCE_QUENCH:
565 		/* Just silently ignore these. */
566 		goto out;
567 	case ICMP_PARAMETERPROB:
568 		err = EPROTO;
569 		break;
570 	case ICMP_DEST_UNREACH:
571 		if (code > NR_ICMP_UNREACH)
572 			goto out;
573 
574 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
575 			/* We are not interested in TCP_LISTEN and open_requests
576 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
577 			 * they should go through unfragmented).
578 			 */
579 			if (sk->sk_state == TCP_LISTEN)
580 				goto out;
581 
582 			WRITE_ONCE(tp->mtu_info, info);
583 			if (!sock_owned_by_user(sk)) {
584 				tcp_v4_mtu_reduced(sk);
585 			} else {
586 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
587 					sock_hold(sk);
588 			}
589 			goto out;
590 		}
591 
592 		err = icmp_err_convert[code].errno;
593 		/* check if this ICMP message allows revert of backoff.
594 		 * (see RFC 6069)
595 		 */
596 		if (!fastopen &&
597 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
598 			tcp_ld_RTO_revert(sk, seq);
599 		break;
600 	case ICMP_TIME_EXCEEDED:
601 		err = EHOSTUNREACH;
602 		break;
603 	default:
604 		goto out;
605 	}
606 
607 	switch (sk->sk_state) {
608 	case TCP_SYN_SENT:
609 	case TCP_SYN_RECV:
610 		/* Only in fast or simultaneous open. If a fast open socket is
611 		 * already accepted it is treated as a connected one below.
612 		 */
613 		if (fastopen && !fastopen->sk)
614 			break;
615 
616 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
617 
618 		if (!sock_owned_by_user(sk))
619 			tcp_done_with_error(sk, err);
620 		else
621 			WRITE_ONCE(sk->sk_err_soft, err);
622 		goto out;
623 	}
624 
625 	/* If we've already connected we will keep trying
626 	 * until we time out, or the user gives up.
627 	 *
628 	 * rfc1122 4.2.3.9 allows considering as hard errors
629 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
630 	 * but it is obsoleted by pmtu discovery).
631 	 *
632 	 * Note that in the modern internet, where routing is unreliable
633 	 * and broken firewalls sit in every dark corner sending random
634 	 * errors ordered by their masters, even these two messages finally
635 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
636 	 *
637 	 * Now we are in compliance with RFCs.
638 	 *							--ANK (980905)
639 	 */
640 
641 	if (!sock_owned_by_user(sk) &&
642 	    inet_test_bit(RECVERR, sk)) {
643 		WRITE_ONCE(sk->sk_err, err);
644 		sk_error_report(sk);
645 	} else	{ /* Only an error on timeout */
646 		WRITE_ONCE(sk->sk_err_soft, err);
647 	}
648 
649 out:
650 	bh_unlock_sock(sk);
651 	sock_put(sk);
652 	return 0;
653 }
654 
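/* Set up a CHECKSUM_PARTIAL transmit: seed th->check with the complement of
 * the pseudo-header checksum and record csum_start/csum_offset so that the
 * NIC (or the software fallback) can fold in the payload checksum later.
 */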
655 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
656 {
657 	struct tcphdr *th = tcp_hdr(skb);
658 
659 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
660 	skb->csum_start = skb_transport_header(skb) - skb->head;
661 	skb->csum_offset = offsetof(struct tcphdr, check);
662 }
663 
664 /* This routine computes an IPv4 TCP checksum. */
665 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
666 {
667 	const struct inet_sock *inet = inet_sk(sk);
668 
669 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
670 }
671 EXPORT_SYMBOL(tcp_v4_send_check);
672 
673 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
674 
675 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
676 				 const struct tcp_ao_hdr *aoh,
677 				 struct ip_reply_arg *arg, struct tcphdr *reply,
678 				 __be32 reply_options[REPLY_OPTIONS_LEN])
679 {
680 #ifdef CONFIG_TCP_AO
681 	int sdif = tcp_v4_sdif(skb);
682 	int dif = inet_iif(skb);
683 	int l3index = sdif ? dif : 0;
684 	bool allocated_traffic_key;
685 	struct tcp_ao_key *key;
686 	char *traffic_key;
687 	bool drop = true;
688 	u32 ao_sne = 0;
689 	u8 keyid;
690 
691 	rcu_read_lock();
692 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
693 				 &key, &traffic_key, &allocated_traffic_key,
694 				 &keyid, &ao_sne))
695 		goto out;
696 
697 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
698 				 (aoh->rnext_keyid << 8) | keyid);
699 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
700 	reply->doff = arg->iov[0].iov_len / 4;
701 
702 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
703 			    key, traffic_key,
704 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
705 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
706 			    reply, ao_sne))
707 		goto out;
708 	drop = false;
709 out:
710 	rcu_read_unlock();
711 	if (allocated_traffic_key)
712 		kfree(traffic_key);
713 	return drop;
714 #else
715 	return true;
716 #endif
717 }
718 
719 /*
720  *	This routine will send an RST to the other tcp.
721  *
722  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
723  *		      for the reset?
724  *	Answer: if a packet caused an RST, it is not for a socket
725  *		existing in our system; if it is matched to a socket,
726  *		it is just a duplicate segment or a bug in the other side's TCP.
727  *		So we build the reply based only on the parameters that
728  *		arrived with the segment.
729  *	Exception: precedence violation. We do not implement it in any case.
730  */
731 
732 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
733 			      enum sk_rst_reason reason)
734 {
735 	const struct tcphdr *th = tcp_hdr(skb);
736 	struct {
737 		struct tcphdr th;
738 		__be32 opt[REPLY_OPTIONS_LEN];
739 	} rep;
740 	const __u8 *md5_hash_location = NULL;
741 	const struct tcp_ao_hdr *aoh;
742 	struct ip_reply_arg arg;
743 #ifdef CONFIG_TCP_MD5SIG
744 	struct tcp_md5sig_key *key = NULL;
745 	unsigned char newhash[16];
746 	struct sock *sk1 = NULL;
747 	int genhash;
748 #endif
749 	u64 transmit_time = 0;
750 	struct sock *ctl_sk;
751 	struct net *net;
752 	u32 txhash = 0;
753 
754 	/* Never send a reset in response to a reset. */
755 	if (th->rst)
756 		return;
757 
758 	/* If sk is not NULL, it means we did a successful lookup and the
759 	 * incoming route had to be correct. prequeue might have dropped our dst.
760 	 */
761 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
762 		return;
763 
764 	/* Swap the send and the receive. */
765 	memset(&rep, 0, sizeof(rep));
766 	rep.th.dest   = th->source;
767 	rep.th.source = th->dest;
768 	rep.th.doff   = sizeof(struct tcphdr) / 4;
769 	rep.th.rst    = 1;
770 
771 	if (th->ack) {
772 		rep.th.seq = th->ack_seq;
773 	} else {
774 		rep.th.ack = 1;
775 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
776 				       skb->len - (th->doff << 2));
777 	}
778 
779 	memset(&arg, 0, sizeof(arg));
780 	arg.iov[0].iov_base = (unsigned char *)&rep;
781 	arg.iov[0].iov_len  = sizeof(rep.th);
782 
783 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
784 
785 	/* Invalid TCP option size or twice included auth */
786 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
787 		return;
788 
789 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
790 		return;
791 
792 #ifdef CONFIG_TCP_MD5SIG
793 	rcu_read_lock();
794 	if (sk && sk_fullsock(sk)) {
795 		const union tcp_md5_addr *addr;
796 		int l3index;
797 
798 		/* sdif set means the packet ingressed via a device
799 		 * in an L3 domain and inet_iif is set to it.
800 		 */
801 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
802 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
803 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
804 	} else if (md5_hash_location) {
805 		const union tcp_md5_addr *addr;
806 		int sdif = tcp_v4_sdif(skb);
807 		int dif = inet_iif(skb);
808 		int l3index;
809 
810 		/*
811 		 * The active side is lost. Try to find the listening socket
812 		 * through the source port, and then find the md5 key through
813 		 * the listening socket. We do not loosen security here:
814 		 * the incoming packet is checked against the md5 hash of the
815 		 * key we find; no RST is generated if the md5 hash doesn't match.
816 		 */
817 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
818 					     NULL, 0, ip_hdr(skb)->saddr,
819 					     th->source, ip_hdr(skb)->daddr,
820 					     ntohs(th->source), dif, sdif);
821 		/* don't send rst if it can't find key */
822 		if (!sk1)
823 			goto out;
824 
825 		/* sdif set means the packet ingressed via a device
826 		 * in an L3 domain and dif is set to it.
827 		 */
828 		l3index = sdif ? dif : 0;
829 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
830 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
831 		if (!key)
832 			goto out;
833 
835 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
836 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
837 			goto out;
838 
839 	}
840 
841 	if (key) {
842 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
843 				   (TCPOPT_NOP << 16) |
844 				   (TCPOPT_MD5SIG << 8) |
845 				   TCPOLEN_MD5SIG);
846 		/* Update length and the length the header thinks exists */
847 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
848 		rep.th.doff = arg.iov[0].iov_len / 4;
849 
850 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
851 				     key, ip_hdr(skb)->saddr,
852 				     ip_hdr(skb)->daddr, &rep.th);
853 	}
854 #endif
855 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
856 	if (rep.opt[0] == 0) {
857 		__be32 mrst = mptcp_reset_option(skb);
858 
859 		if (mrst) {
860 			rep.opt[0] = mrst;
861 			arg.iov[0].iov_len += sizeof(mrst);
862 			rep.th.doff = arg.iov[0].iov_len / 4;
863 		}
864 	}
865 
866 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
867 				      ip_hdr(skb)->saddr, /* XXX */
868 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
869 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
870 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
871 
872 	/* When the socket is gone, all binding information is lost and
873 	 * routing might fail in this case. No choice here: if we choose to force
874 	 * the input interface, we will misroute in case of an asymmetric route.
875 	 */
876 	if (sk)
877 		arg.bound_dev_if = sk->sk_bound_dev_if;
878 
879 	trace_tcp_send_reset(sk, skb, reason);
880 
881 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
882 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
883 
884 	arg.tos = ip_hdr(skb)->tos;
885 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
886 	local_bh_disable();
887 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
888 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
889 
890 	sock_net_set(ctl_sk, net);
891 	if (sk) {
892 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
893 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
894 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
895 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
896 		transmit_time = tcp_transmit_time(sk);
897 		xfrm_sk_clone_policy(ctl_sk, sk);
898 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
899 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
900 	} else {
901 		ctl_sk->sk_mark = 0;
902 		ctl_sk->sk_priority = 0;
903 	}
904 	ip_send_unicast_reply(ctl_sk,
905 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
906 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
907 			      &arg, arg.iov[0].iov_len,
908 			      transmit_time, txhash);
909 
910 	xfrm_sk_free_policy(ctl_sk);
911 	sock_net_set(ctl_sk, &init_net);
912 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
913 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
914 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
915 	local_bh_enable();
916 
917 #ifdef CONFIG_TCP_MD5SIG
918 out:
919 	rcu_read_unlock();
920 #endif
921 }
922 
923 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
924    outside of socket context, is certainly ugly. What can I do?
925  */
926 
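/* Build and send a bare ACK from the per-cpu control socket.  Used when no
 * full socket exists: for TIME-WAIT sockets (tcp_v4_timewait_ack) and for
 * SYN-RECV request sockets (tcp_v4_reqsk_send_ack).
 */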
927 static void tcp_v4_send_ack(const struct sock *sk,
928 			    struct sk_buff *skb, u32 seq, u32 ack,
929 			    u32 win, u32 tsval, u32 tsecr, int oif,
930 			    struct tcp_key *key,
931 			    int reply_flags, u8 tos, u32 txhash)
932 {
933 	const struct tcphdr *th = tcp_hdr(skb);
934 	struct {
935 		struct tcphdr th;
936 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
937 	} rep;
938 	struct net *net = sock_net(sk);
939 	struct ip_reply_arg arg;
940 	struct sock *ctl_sk;
941 	u64 transmit_time;
942 
943 	memset(&rep.th, 0, sizeof(struct tcphdr));
944 	memset(&arg, 0, sizeof(arg));
945 
946 	arg.iov[0].iov_base = (unsigned char *)&rep;
947 	arg.iov[0].iov_len  = sizeof(rep.th);
948 	if (tsecr) {
949 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
950 				   (TCPOPT_TIMESTAMP << 8) |
951 				   TCPOLEN_TIMESTAMP);
952 		rep.opt[1] = htonl(tsval);
953 		rep.opt[2] = htonl(tsecr);
954 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
955 	}
956 
957 	/* Swap the send and the receive. */
958 	rep.th.dest    = th->source;
959 	rep.th.source  = th->dest;
960 	rep.th.doff    = arg.iov[0].iov_len / 4;
961 	rep.th.seq     = htonl(seq);
962 	rep.th.ack_seq = htonl(ack);
963 	rep.th.ack     = 1;
964 	rep.th.window  = htons(win);
965 
966 #ifdef CONFIG_TCP_MD5SIG
967 	if (tcp_key_is_md5(key)) {
968 		int offset = (tsecr) ? 3 : 0;
969 
970 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
971 					  (TCPOPT_NOP << 16) |
972 					  (TCPOPT_MD5SIG << 8) |
973 					  TCPOLEN_MD5SIG);
974 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
975 		rep.th.doff = arg.iov[0].iov_len/4;
976 
977 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
978 				    key->md5_key, ip_hdr(skb)->saddr,
979 				    ip_hdr(skb)->daddr, &rep.th);
980 	}
981 #endif
982 #ifdef CONFIG_TCP_AO
983 	if (tcp_key_is_ao(key)) {
984 		int offset = (tsecr) ? 3 : 0;
985 
986 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
987 					  (tcp_ao_len(key->ao_key) << 16) |
988 					  (key->ao_key->sndid << 8) |
989 					  key->rcv_next);
990 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
991 		rep.th.doff = arg.iov[0].iov_len / 4;
992 
993 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
994 				key->ao_key, key->traffic_key,
995 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
996 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
997 				&rep.th, key->sne);
998 	}
999 #endif
1000 	arg.flags = reply_flags;
1001 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1002 				      ip_hdr(skb)->saddr, /* XXX */
1003 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1004 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1005 	if (oif)
1006 		arg.bound_dev_if = oif;
1007 	arg.tos = tos;
1008 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1009 	local_bh_disable();
1010 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1011 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1012 	sock_net_set(ctl_sk, net);
1013 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1014 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1015 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1016 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1017 	transmit_time = tcp_transmit_time(sk);
1018 	ip_send_unicast_reply(ctl_sk,
1019 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1020 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1021 			      &arg, arg.iov[0].iov_len,
1022 			      transmit_time, txhash);
1023 
1024 	sock_net_set(ctl_sk, &init_net);
1025 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1026 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1027 	local_bh_enable();
1028 }
1029 
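/* Called when tcp_timewait_state_process() asks for an ACK (TCP_TW_ACK):
 * re-ACK the peer using the sequence numbers, window and timestamp state
 * remembered in the timewait sock.
 */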
1030 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1031 {
1032 	struct inet_timewait_sock *tw = inet_twsk(sk);
1033 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1034 	struct tcp_key key = {};
1035 #ifdef CONFIG_TCP_AO
1036 	struct tcp_ao_info *ao_info;
1037 
1038 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1039 		/* FIXME: the segment to-be-acked is not verified yet */
1040 		ao_info = rcu_dereference(tcptw->ao_info);
1041 		if (ao_info) {
1042 			const struct tcp_ao_hdr *aoh;
1043 
1044 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1045 				inet_twsk_put(tw);
1046 				return;
1047 			}
1048 
1049 			if (aoh)
1050 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1051 		}
1052 	}
1053 	if (key.ao_key) {
1054 		struct tcp_ao_key *rnext_key;
1055 
1056 		key.traffic_key = snd_other_key(key.ao_key);
1057 		key.sne = READ_ONCE(ao_info->snd_sne);
1058 		rnext_key = READ_ONCE(ao_info->rnext_key);
1059 		key.rcv_next = rnext_key->rcvid;
1060 		key.type = TCP_KEY_AO;
1061 #else
1062 	if (0) {
1063 #endif
1064 	} else if (static_branch_tcp_md5()) {
1065 		key.md5_key = tcp_twsk_md5_key(tcptw);
1066 		if (key.md5_key)
1067 			key.type = TCP_KEY_MD5;
1068 	}
1069 
1070 	tcp_v4_send_ack(sk, skb,
1071 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1072 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1073 			tcp_tw_tsval(tcptw),
1074 			READ_ONCE(tcptw->tw_ts_recent),
1075 			tw->tw_bound_dev_if, &key,
1076 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1077 			tw->tw_tos,
1078 			tw->tw_txhash);
1079 
1080 	inet_twsk_put(tw);
1081 }
1082 
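/* ACK on behalf of a request socket still in SYN-RECV, e.g. in response to
 * an out-of-window or PAWS-failing segment, signing the ACK with TCP-AO or
 * TCP-MD5 when the connection uses them.
 */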
1083 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1084 				  struct request_sock *req)
1085 {
1086 	struct tcp_key key = {};
1087 
1088 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1089 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1090 	 */
1091 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1092 					     tcp_sk(sk)->snd_nxt;
1093 
1094 #ifdef CONFIG_TCP_AO
1095 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1096 	    tcp_rsk_used_ao(req)) {
1097 		const union tcp_md5_addr *addr;
1098 		const struct tcp_ao_hdr *aoh;
1099 		int l3index;
1100 
1101 		/* Invalid TCP option size or twice included auth */
1102 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1103 			return;
1104 		if (!aoh)
1105 			return;
1106 
1107 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1108 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1109 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1110 					      aoh->rnext_keyid, -1);
1111 		if (unlikely(!key.ao_key)) {
1112 			/* Send ACK with any matching MKT for the peer */
1113 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1114 			/* Matching key disappeared (user removed the key?),
1115 			 * so let the handshake time out.
1116 			 */
1117 			if (!key.ao_key) {
1118 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1119 						     addr,
1120 						     ntohs(tcp_hdr(skb)->source),
1121 						     &ip_hdr(skb)->daddr,
1122 						     ntohs(tcp_hdr(skb)->dest));
1123 				return;
1124 			}
1125 		}
1126 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1127 		if (!key.traffic_key)
1128 			return;
1129 
1130 		key.type = TCP_KEY_AO;
1131 		key.rcv_next = aoh->keyid;
1132 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1133 #else
1134 	if (0) {
1135 #endif
1136 	} else if (static_branch_tcp_md5()) {
1137 		const union tcp_md5_addr *addr;
1138 		int l3index;
1139 
1140 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1141 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1142 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1143 		if (key.md5_key)
1144 			key.type = TCP_KEY_MD5;
1145 	}
1146 
1147 	tcp_v4_send_ack(sk, skb, seq,
1148 			tcp_rsk(req)->rcv_nxt,
1149 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1150 			tcp_rsk_tsval(tcp_rsk(req)),
1151 			READ_ONCE(req->ts_recent),
1152 			0, &key,
1153 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1154 			ip_hdr(skb)->tos,
1155 			READ_ONCE(tcp_rsk(req)->txhash));
1156 	if (tcp_key_is_ao(&key))
1157 		kfree(key.traffic_key);
1158 }
1159 
1160 /*
1161  *	Send a SYN-ACK after having received a SYN.
1162  *	This still operates on a request_sock only, not on a big
1163  *	socket.
1164  */
1165 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1166 			      struct flowi *fl,
1167 			      struct request_sock *req,
1168 			      struct tcp_fastopen_cookie *foc,
1169 			      enum tcp_synack_type synack_type,
1170 			      struct sk_buff *syn_skb)
1171 {
1172 	const struct inet_request_sock *ireq = inet_rsk(req);
1173 	struct flowi4 fl4;
1174 	int err = -1;
1175 	struct sk_buff *skb;
1176 	u8 tos;
1177 
1178 	/* First, grab a route. */
1179 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1180 		return -1;
1181 
1182 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1183 
1184 	if (skb) {
1185 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1186 
1187 		tos = READ_ONCE(inet_sk(sk)->tos);
1188 
1189 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1190 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1191 			      (tos & INET_ECN_MASK);
1192 
1193 		if (!INET_ECN_is_capable(tos) &&
1194 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1195 			tos |= INET_ECN_ECT_0;
1196 
1197 		rcu_read_lock();
1198 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1199 					    ireq->ir_rmt_addr,
1200 					    rcu_dereference(ireq->ireq_opt),
1201 					    tos);
1202 		rcu_read_unlock();
1203 		err = net_xmit_eval(err);
1204 	}
1205 
1206 	return err;
1207 }
1208 
1209 /*
1210  *	IPv4 request_sock destructor.
1211  */
1212 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1213 {
1214 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1215 }
1216 
1217 #ifdef CONFIG_TCP_MD5SIG
1218 /*
1219  * RFC2385 MD5 checksumming requires a mapping of
1220  * IP address->MD5 Key.
1221  * We need to maintain these in the sk structure.
1222  */
1223 
1224 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1225 EXPORT_SYMBOL(tcp_md5_needed);
1226 
1227 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1228 {
1229 	if (!old)
1230 		return true;
1231 
1232 	/* l3index always overrides non-l3index */
1233 	if (old->l3index && new->l3index == 0)
1234 		return false;
1235 	if (old->l3index == 0 && new->l3index)
1236 		return true;
1237 
1238 	return old->prefixlen < new->prefixlen;
1239 }
1240 
1241 /* Find the Key structure for an address.  */
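/* Keys are matched on family, optional L3 (VRF) scope and address prefix;
 * when several keys match, better_md5_match() prefers an L3-scoped key and
 * then the longest prefix.
 */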
1242 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1243 					   const union tcp_md5_addr *addr,
1244 					   int family, bool any_l3index)
1245 {
1246 	const struct tcp_sock *tp = tcp_sk(sk);
1247 	struct tcp_md5sig_key *key;
1248 	const struct tcp_md5sig_info *md5sig;
1249 	__be32 mask;
1250 	struct tcp_md5sig_key *best_match = NULL;
1251 	bool match;
1252 
1253 	/* caller either holds rcu_read_lock() or socket lock */
1254 	md5sig = rcu_dereference_check(tp->md5sig_info,
1255 				       lockdep_sock_is_held(sk));
1256 	if (!md5sig)
1257 		return NULL;
1258 
1259 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1260 				 lockdep_sock_is_held(sk)) {
1261 		if (key->family != family)
1262 			continue;
1263 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1264 		    key->l3index != l3index)
1265 			continue;
1266 		if (family == AF_INET) {
1267 			mask = inet_make_mask(key->prefixlen);
1268 			match = (key->addr.a4.s_addr & mask) ==
1269 				(addr->a4.s_addr & mask);
1270 #if IS_ENABLED(CONFIG_IPV6)
1271 		} else if (family == AF_INET6) {
1272 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1273 						  key->prefixlen);
1274 #endif
1275 		} else {
1276 			match = false;
1277 		}
1278 
1279 		if (match && better_md5_match(best_match, key))
1280 			best_match = key;
1281 	}
1282 	return best_match;
1283 }
1284 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1285 
1286 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1287 						      const union tcp_md5_addr *addr,
1288 						      int family, u8 prefixlen,
1289 						      int l3index, u8 flags)
1290 {
1291 	const struct tcp_sock *tp = tcp_sk(sk);
1292 	struct tcp_md5sig_key *key;
1293 	unsigned int size = sizeof(struct in_addr);
1294 	const struct tcp_md5sig_info *md5sig;
1295 
1296 	/* caller either holds rcu_read_lock() or socket lock */
1297 	md5sig = rcu_dereference_check(tp->md5sig_info,
1298 				       lockdep_sock_is_held(sk));
1299 	if (!md5sig)
1300 		return NULL;
1301 #if IS_ENABLED(CONFIG_IPV6)
1302 	if (family == AF_INET6)
1303 		size = sizeof(struct in6_addr);
1304 #endif
1305 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1306 				 lockdep_sock_is_held(sk)) {
1307 		if (key->family != family)
1308 			continue;
1309 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1310 			continue;
1311 		if (key->l3index != l3index)
1312 			continue;
1313 		if (!memcmp(&key->addr, addr, size) &&
1314 		    key->prefixlen == prefixlen)
1315 			return key;
1316 	}
1317 	return NULL;
1318 }
1319 
1320 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1321 					 const struct sock *addr_sk)
1322 {
1323 	const union tcp_md5_addr *addr;
1324 	int l3index;
1325 
1326 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1327 						 addr_sk->sk_bound_dev_if);
1328 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1329 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1330 }
1331 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1332 
1333 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1334 {
1335 	struct tcp_sock *tp = tcp_sk(sk);
1336 	struct tcp_md5sig_info *md5sig;
1337 
1338 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1339 	if (!md5sig)
1340 		return -ENOMEM;
1341 
1342 	sk_gso_disable(sk);
1343 	INIT_HLIST_HEAD(&md5sig->head);
1344 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1345 	return 0;
1346 }
1347 
1348 /* This can be called on a newly created socket, from other files */
1349 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1350 			    int family, u8 prefixlen, int l3index, u8 flags,
1351 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1352 {
1353 	/* Add Key to the list */
1354 	struct tcp_md5sig_key *key;
1355 	struct tcp_sock *tp = tcp_sk(sk);
1356 	struct tcp_md5sig_info *md5sig;
1357 
1358 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1359 	if (key) {
1360 		/* Pre-existing entry - just update that one.
1361 		 * Note that the key might be used concurrently.
1362 		 * data_race() tells KCSAN that we do not care about
1363 		 * key mismatches, since changing the MD5 key on live flows
1364 		 * can lead to packet drops.
1365 		 */
1366 		data_race(memcpy(key->key, newkey, newkeylen));
1367 
1368 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1369 		 * Also note that a reader could catch the new key->keylen value
1370 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1371 		 * at sock_kmalloc() time below these lines.
1372 		 */
1373 		WRITE_ONCE(key->keylen, newkeylen);
1374 
1375 		return 0;
1376 	}
1377 
1378 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1379 					   lockdep_sock_is_held(sk));
1380 
1381 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1382 	if (!key)
1383 		return -ENOMEM;
1384 
1385 	memcpy(key->key, newkey, newkeylen);
1386 	key->keylen = newkeylen;
1387 	key->family = family;
1388 	key->prefixlen = prefixlen;
1389 	key->l3index = l3index;
1390 	key->flags = flags;
1391 	memcpy(&key->addr, addr,
1392 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1393 								 sizeof(struct in_addr));
1394 	hlist_add_head_rcu(&key->node, &md5sig->head);
1395 	return 0;
1396 }
1397 
1398 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1399 		   int family, u8 prefixlen, int l3index, u8 flags,
1400 		   const u8 *newkey, u8 newkeylen)
1401 {
1402 	struct tcp_sock *tp = tcp_sk(sk);
1403 
1404 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1405 		if (tcp_md5_alloc_sigpool())
1406 			return -ENOMEM;
1407 
1408 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1409 			tcp_md5_release_sigpool();
1410 			return -ENOMEM;
1411 		}
1412 
1413 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1414 			struct tcp_md5sig_info *md5sig;
1415 
1416 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1417 			rcu_assign_pointer(tp->md5sig_info, NULL);
1418 			kfree_rcu(md5sig, rcu);
1419 			tcp_md5_release_sigpool();
1420 			return -EUSERS;
1421 		}
1422 	}
1423 
1424 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1425 				newkey, newkeylen, GFP_KERNEL);
1426 }
1427 EXPORT_SYMBOL(tcp_md5_do_add);
1428 
1429 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1430 		     int family, u8 prefixlen, int l3index,
1431 		     struct tcp_md5sig_key *key)
1432 {
1433 	struct tcp_sock *tp = tcp_sk(sk);
1434 
1435 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1436 		tcp_md5_add_sigpool();
1437 
1438 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1439 			tcp_md5_release_sigpool();
1440 			return -ENOMEM;
1441 		}
1442 
1443 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1444 			struct tcp_md5sig_info *md5sig;
1445 
1446 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1447 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1448 			rcu_assign_pointer(tp->md5sig_info, NULL);
1449 			kfree_rcu(md5sig, rcu);
1450 			tcp_md5_release_sigpool();
1451 			return -EUSERS;
1452 		}
1453 	}
1454 
1455 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1456 				key->flags, key->key, key->keylen,
1457 				sk_gfp_mask(sk, GFP_ATOMIC));
1458 }
1459 EXPORT_SYMBOL(tcp_md5_key_copy);
1460 
1461 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1462 		   u8 prefixlen, int l3index, u8 flags)
1463 {
1464 	struct tcp_md5sig_key *key;
1465 
1466 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1467 	if (!key)
1468 		return -ENOENT;
1469 	hlist_del_rcu(&key->node);
1470 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1471 	kfree_rcu(key, rcu);
1472 	return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_md5_do_del);
1475 
1476 void tcp_clear_md5_list(struct sock *sk)
1477 {
1478 	struct tcp_sock *tp = tcp_sk(sk);
1479 	struct tcp_md5sig_key *key;
1480 	struct hlist_node *n;
1481 	struct tcp_md5sig_info *md5sig;
1482 
1483 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1484 
1485 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1486 		hlist_del_rcu(&key->node);
1487 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1488 		kfree_rcu(key, rcu);
1489 	}
1490 }
1491 
1492 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1493 				 sockptr_t optval, int optlen)
1494 {
1495 	struct tcp_md5sig cmd;
1496 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1497 	const union tcp_md5_addr *addr;
1498 	u8 prefixlen = 32;
1499 	int l3index = 0;
1500 	bool l3flag;
1501 	u8 flags;
1502 
1503 	if (optlen < sizeof(cmd))
1504 		return -EINVAL;
1505 
1506 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1507 		return -EFAULT;
1508 
1509 	if (sin->sin_family != AF_INET)
1510 		return -EINVAL;
1511 
1512 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1513 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1514 
1515 	if (optname == TCP_MD5SIG_EXT &&
1516 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1517 		prefixlen = cmd.tcpm_prefixlen;
1518 		if (prefixlen > 32)
1519 			return -EINVAL;
1520 	}
1521 
1522 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1523 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1524 		struct net_device *dev;
1525 
1526 		rcu_read_lock();
1527 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1528 		if (dev && netif_is_l3_master(dev))
1529 			l3index = dev->ifindex;
1530 
1531 		rcu_read_unlock();
1532 
1533 		/* ok to reference set/not set outside of rcu;
1534 		 * right now device MUST be an L3 master
1535 		 */
1536 		if (!dev || !l3index)
1537 			return -EINVAL;
1538 	}
1539 
1540 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1541 
1542 	if (!cmd.tcpm_keylen)
1543 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1544 
1545 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1546 		return -EINVAL;
1547 
1548 	/* Don't allow keys for peers that have a matching TCP-AO key.
1549 	 * See the comment in tcp_ao_add_cmd()
1550 	 */
1551 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1552 		return -EKEYREJECTED;
1553 
1554 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1555 			      cmd.tcpm_key, cmd.tcpm_keylen);
1556 }
1557 
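/* RFC 2385 signs the pseudo-header, the 20-byte TCP header with a zeroed
 * checksum (options excluded), the payload and the key; this helper feeds
 * the first two pieces into the hash.
 */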
1558 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1559 				   __be32 daddr, __be32 saddr,
1560 				   const struct tcphdr *th, int nbytes)
1561 {
1562 	struct tcp4_pseudohdr *bp;
1563 	struct scatterlist sg;
1564 	struct tcphdr *_th;
1565 
1566 	bp = hp->scratch;
1567 	bp->saddr = saddr;
1568 	bp->daddr = daddr;
1569 	bp->pad = 0;
1570 	bp->protocol = IPPROTO_TCP;
1571 	bp->len = cpu_to_be16(nbytes);
1572 
1573 	_th = (struct tcphdr *)(bp + 1);
1574 	memcpy(_th, th, sizeof(*th));
1575 	_th->check = 0;
1576 
1577 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1578 	ahash_request_set_crypt(hp->req, &sg, NULL,
1579 				sizeof(*bp) + sizeof(*th));
1580 	return crypto_ahash_update(hp->req);
1581 }
1582 
1583 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1584 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1585 {
1586 	struct tcp_sigpool hp;
1587 
1588 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1589 		goto clear_hash_nostart;
1590 
1591 	if (crypto_ahash_init(hp.req))
1592 		goto clear_hash;
1593 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1594 		goto clear_hash;
1595 	if (tcp_md5_hash_key(&hp, key))
1596 		goto clear_hash;
1597 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1598 	if (crypto_ahash_final(hp.req))
1599 		goto clear_hash;
1600 
1601 	tcp_sigpool_end(&hp);
1602 	return 0;
1603 
1604 clear_hash:
1605 	tcp_sigpool_end(&hp);
1606 clear_hash_nostart:
1607 	memset(md5_hash, 0, 16);
1608 	return 1;
1609 }
1610 
1611 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1612 			const struct sock *sk,
1613 			const struct sk_buff *skb)
1614 {
1615 	const struct tcphdr *th = tcp_hdr(skb);
1616 	struct tcp_sigpool hp;
1617 	__be32 saddr, daddr;
1618 
1619 	if (sk) { /* valid for establish/request sockets */
1620 		saddr = sk->sk_rcv_saddr;
1621 		daddr = sk->sk_daddr;
1622 	} else {
1623 		const struct iphdr *iph = ip_hdr(skb);
1624 		saddr = iph->saddr;
1625 		daddr = iph->daddr;
1626 	}
1627 
1628 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1629 		goto clear_hash_nostart;
1630 
1631 	if (crypto_ahash_init(hp.req))
1632 		goto clear_hash;
1633 
1634 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1635 		goto clear_hash;
1636 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1637 		goto clear_hash;
1638 	if (tcp_md5_hash_key(&hp, key))
1639 		goto clear_hash;
1640 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1641 	if (crypto_ahash_final(hp.req))
1642 		goto clear_hash;
1643 
1644 	tcp_sigpool_end(&hp);
1645 	return 0;
1646 
1647 clear_hash:
1648 	tcp_sigpool_end(&hp);
1649 clear_hash_nostart:
1650 	memset(md5_hash, 0, 16);
1651 	return 1;
1652 }
1653 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1654 
1655 #endif
1656 
1657 static void tcp_v4_init_req(struct request_sock *req,
1658 			    const struct sock *sk_listener,
1659 			    struct sk_buff *skb)
1660 {
1661 	struct inet_request_sock *ireq = inet_rsk(req);
1662 	struct net *net = sock_net(sk_listener);
1663 
1664 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1665 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1666 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1667 }
1668 
1669 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1670 					  struct sk_buff *skb,
1671 					  struct flowi *fl,
1672 					  struct request_sock *req,
1673 					  u32 tw_isn)
1674 {
1675 	tcp_v4_init_req(req, sk, skb);
1676 
1677 	if (security_inet_conn_request(sk, skb, req))
1678 		return NULL;
1679 
1680 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1681 }
1682 
1683 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1684 	.family		=	PF_INET,
1685 	.obj_size	=	sizeof(struct tcp_request_sock),
1686 	.rtx_syn_ack	=	tcp_rtx_synack,
1687 	.send_ack	=	tcp_v4_reqsk_send_ack,
1688 	.destructor	=	tcp_v4_reqsk_destructor,
1689 	.send_reset	=	tcp_v4_send_reset,
1690 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1691 };
1692 
1693 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1694 	.mss_clamp	=	TCP_MSS_DEFAULT,
1695 #ifdef CONFIG_TCP_MD5SIG
1696 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1697 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1698 #endif
1699 #ifdef CONFIG_TCP_AO
1700 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1701 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1702 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1703 #endif
1704 #ifdef CONFIG_SYN_COOKIES
1705 	.cookie_init_seq =	cookie_v4_init_sequence,
1706 #endif
1707 	.route_req	=	tcp_v4_route_req,
1708 	.init_seq	=	tcp_v4_init_seq,
1709 	.init_ts_off	=	tcp_v4_init_ts_off,
1710 	.send_synack	=	tcp_v4_send_synack,
1711 };
1712 
1713 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1714 {
1715 	/* Never answer SYNs sent to broadcast or multicast */
1716 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1717 		goto drop;
1718 
1719 	return tcp_conn_request(&tcp_request_sock_ops,
1720 				&tcp_request_sock_ipv4_ops, sk, skb);
1721 
1722 drop:
1723 	tcp_listendrop(sk);
1724 	return 0;
1725 }
1726 EXPORT_SYMBOL(tcp_v4_conn_request);
1727 
1728 
1729 /*
1730  * The three way handshake has completed - we got a valid ACK -
1731  * now create the new socket.
1732  */
1733 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1734 				  struct request_sock *req,
1735 				  struct dst_entry *dst,
1736 				  struct request_sock *req_unhash,
1737 				  bool *own_req)
1738 {
1739 	struct inet_request_sock *ireq;
1740 	bool found_dup_sk = false;
1741 	struct inet_sock *newinet;
1742 	struct tcp_sock *newtp;
1743 	struct sock *newsk;
1744 #ifdef CONFIG_TCP_MD5SIG
1745 	const union tcp_md5_addr *addr;
1746 	struct tcp_md5sig_key *key;
1747 	int l3index;
1748 #endif
1749 	struct ip_options_rcu *inet_opt;
1750 
1751 	if (sk_acceptq_is_full(sk))
1752 		goto exit_overflow;
1753 
1754 	newsk = tcp_create_openreq_child(sk, req, skb);
1755 	if (!newsk)
1756 		goto exit_nonewsk;
1757 
1758 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1759 	inet_sk_rx_dst_set(newsk, skb);
1760 
1761 	newtp		      = tcp_sk(newsk);
1762 	newinet		      = inet_sk(newsk);
1763 	ireq		      = inet_rsk(req);
1764 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1765 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1766 	newsk->sk_bound_dev_if = ireq->ir_iif;
1767 	newinet->inet_saddr   = ireq->ir_loc_addr;
1768 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1769 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1770 	newinet->mc_index     = inet_iif(skb);
1771 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1772 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1773 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1774 	if (inet_opt)
1775 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1776 	atomic_set(&newinet->inet_id, get_random_u16());
1777 
1778 	/* Set ToS of the new socket based upon the value of incoming SYN.
1779 	 * ECT bits are set later in tcp_init_transfer().
1780 	 */
1781 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1782 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1783 
1784 	if (!dst) {
1785 		dst = inet_csk_route_child_sock(sk, newsk, req);
1786 		if (!dst)
1787 			goto put_and_exit;
1788 	} else {
1789 		/* syncookie case: see end of cookie_v4_check() */
1790 	}
1791 	sk_setup_caps(newsk, dst);
1792 
1793 	tcp_ca_openreq_child(newsk, dst);
1794 
1795 	tcp_sync_mss(newsk, dst_mtu(dst));
1796 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1797 
1798 	tcp_initialize_rcv_mss(newsk);
1799 
1800 #ifdef CONFIG_TCP_MD5SIG
1801 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1802 	/* Copy over the MD5 key from the original socket */
1803 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1804 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1805 	if (key && !tcp_rsk_used_ao(req)) {
1806 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1807 			goto put_and_exit;
1808 		sk_gso_disable(newsk);
1809 	}
1810 #endif
1811 #ifdef CONFIG_TCP_AO
1812 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1813 		goto put_and_exit; /* OOM, release back memory */
1814 #endif
1815 
1816 	if (__inet_inherit_port(sk, newsk) < 0)
1817 		goto put_and_exit;
1818 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1819 				       &found_dup_sk);
1820 	if (likely(*own_req)) {
1821 		tcp_move_syn(newtp, req);
1822 		ireq->ireq_opt = NULL;
1823 	} else {
1824 		newinet->inet_opt = NULL;
1825 
1826 		if (!req_unhash && found_dup_sk) {
1827 			/* This code path should only be executed in the
1828 			 * syncookie case.
1829 			 */
1830 			bh_unlock_sock(newsk);
1831 			sock_put(newsk);
1832 			newsk = NULL;
1833 		}
1834 	}
1835 	return newsk;
1836 
1837 exit_overflow:
1838 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1839 exit_nonewsk:
1840 	dst_release(dst);
1841 exit:
1842 	tcp_listendrop(sk);
1843 	return NULL;
1844 put_and_exit:
1845 	newinet->inet_opt = NULL;
1846 	inet_csk_prepare_forced_close(newsk);
1847 	tcp_done(newsk);
1848 	goto exit;
1849 }
1850 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1851 
1852 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1853 {
1854 #ifdef CONFIG_SYN_COOKIES
1855 	const struct tcphdr *th = tcp_hdr(skb);
1856 
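	/* A syncookie can only be validated on the ACK that completes the
	 * handshake; a segment with SYN set never carries a valid cookie,
	 * hence the !th->syn test below.
	 */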
1857 	if (!th->syn)
1858 		sk = cookie_v4_check(sk, skb);
1859 #endif
1860 	return sk;
1861 }
1862 
1863 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1864 			 struct tcphdr *th, u32 *cookie)
1865 {
1866 	u16 mss = 0;
1867 #ifdef CONFIG_SYN_COOKIES
1868 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1869 				    &tcp_request_sock_ipv4_ops, sk, th);
1870 	if (mss) {
1871 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1872 		tcp_synq_overflow(sk);
1873 	}
1874 #endif
1875 	return mss;
1876 }
1877 
1878 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1879 							   u32));
1880 /* The socket must have its spinlock held when we get
1881  * here, unless it is a TCP_LISTEN socket.
1882  *
1883  * We have a potential double-lock case here, so even when
1884  * doing backlog processing we use the BH locking scheme.
1885  * This is because we cannot sleep with the original spinlock
1886  * held.
1887  */
1888 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1889 {
1890 	enum skb_drop_reason reason;
1891 	struct sock *rsk;
1892 
1893 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1894 		struct dst_entry *dst;
1895 
1896 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1897 						lockdep_sock_is_held(sk));
1898 
1899 		sock_rps_save_rxhash(sk, skb);
1900 		sk_mark_napi_id(sk, skb);
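		/* Invalidate the cached rx dst if the segment arrived on a
		 * different interface or the route has since been obsoleted;
		 * the next route lookup will install a fresh one.
		 */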
1901 		if (dst) {
1902 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1903 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1904 					     dst, 0)) {
1905 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1906 				dst_release(dst);
1907 			}
1908 		}
1909 		tcp_rcv_established(sk, skb);
1910 		return 0;
1911 	}
1912 
1913 	if (tcp_checksum_complete(skb))
1914 		goto csum_err;
1915 
1916 	if (sk->sk_state == TCP_LISTEN) {
1917 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1918 
1919 		if (!nsk)
1920 			return 0;
1921 		if (nsk != sk) {
1922 			reason = tcp_child_process(sk, nsk, skb);
1923 			if (reason) {
1924 				rsk = nsk;
1925 				goto reset;
1926 			}
1927 			return 0;
1928 		}
1929 	} else
1930 		sock_rps_save_rxhash(sk, skb);
1931 
1932 	reason = tcp_rcv_state_process(sk, skb);
1933 	if (reason) {
1934 		rsk = sk;
1935 		goto reset;
1936 	}
1937 	return 0;
1938 
1939 reset:
1940 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1941 discard:
1942 	sk_skb_reason_drop(sk, skb, reason);
1943 	/* Be careful here. If this function gets more complicated and
1944 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1945 	 * might be destroyed here. The current version compiles correctly,
1946 	 * but you have been warned.
1947 	 */
1948 	return 0;
1949 
1950 csum_err:
1951 	reason = SKB_DROP_REASON_TCP_CSUM;
1952 	trace_tcp_bad_csum(skb);
1953 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1954 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1955 	goto discard;
1956 }
1957 EXPORT_SYMBOL(tcp_v4_do_rcv);
1958 
1959 int tcp_v4_early_demux(struct sk_buff *skb)
1960 {
1961 	struct net *net = dev_net(skb->dev);
1962 	const struct iphdr *iph;
1963 	const struct tcphdr *th;
1964 	struct sock *sk;
1965 
1966 	if (skb->pkt_type != PACKET_HOST)
1967 		return 0;
1968 
1969 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1970 		return 0;
1971 
1972 	iph = ip_hdr(skb);
1973 	th = tcp_hdr(skb);
1974 
1975 	if (th->doff < sizeof(struct tcphdr) / 4)
1976 		return 0;
1977 
1978 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1979 				       iph->saddr, th->source,
1980 				       iph->daddr, ntohs(th->dest),
1981 				       skb->skb_iif, inet_sdif(skb));
1982 	if (sk) {
1983 		skb->sk = sk;
1984 		skb->destructor = sock_edemux;
1985 		if (sk_fullsock(sk)) {
1986 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1987 
1988 			if (dst)
1989 				dst = dst_check(dst, 0);
1990 			if (dst &&
1991 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1992 				skb_dst_set_noref(skb, dst);
1993 		}
1994 	}
1995 	return 0;
1996 }
1997 
1998 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1999 		     enum skb_drop_reason *reason)
2000 {
2001 	u32 tail_gso_size, tail_gso_segs;
2002 	struct skb_shared_info *shinfo;
2003 	const struct tcphdr *th;
2004 	struct tcphdr *thtail;
2005 	struct sk_buff *tail;
2006 	unsigned int hdrlen;
2007 	bool fragstolen;
2008 	u32 gso_segs;
2009 	u32 gso_size;
2010 	u64 limit;
2011 	int delta;
2012 
2013 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2014 	 * we can fix skb->truesize to its real value to avoid future drops.
2015 	 * This is valid because skb is not yet charged to the socket.
2016 	 * It has been noticed that pure SACK packets were sometimes dropped
2017 	 * (if cooked by drivers without the copybreak feature).
2018 	 */
2019 	skb_condense(skb);
2020 
2021 	skb_dst_drop(skb);
2022 
2023 	if (unlikely(tcp_checksum_complete(skb))) {
2024 		bh_unlock_sock(sk);
2025 		trace_tcp_bad_csum(skb);
2026 		*reason = SKB_DROP_REASON_TCP_CSUM;
2027 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2028 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2029 		return true;
2030 	}
2031 
2032 	/* Attempt coalescing to last skb in backlog, even if we are
2033 	 * above the limits.
2034 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2035 	 */
2036 	th = (const struct tcphdr *)skb->data;
2037 	hdrlen = th->doff * 4;
2038 
2039 	tail = sk->sk_backlog.tail;
2040 	if (!tail)
2041 		goto no_coalesce;
2042 	thtail = (struct tcphdr *)tail->data;
2043 
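	/* Coalescing requires a strictly in-sequence segment with the same
	 * DSCP, no SYN/RST/URG on either skb, ACK set on both, unchanged
	 * ECE/CWR bits and identical TCP options (same doff and option
	 * bytes); anything else is queued as its own skb.
	 */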
2044 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2045 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2046 	    ((TCP_SKB_CB(tail)->tcp_flags |
2047 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2048 	    !((TCP_SKB_CB(tail)->tcp_flags &
2049 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2050 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2051 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2052 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2053 	    thtail->doff != th->doff ||
2054 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2055 		goto no_coalesce;
2056 
2057 	__skb_pull(skb, hdrlen);
2058 
2059 	shinfo = skb_shinfo(skb);
2060 	gso_size = shinfo->gso_size ?: skb->len;
2061 	gso_segs = shinfo->gso_segs ?: 1;
2062 
2063 	shinfo = skb_shinfo(tail);
2064 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2065 	tail_gso_segs = shinfo->gso_segs ?: 1;
2066 
2067 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2068 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2069 
2070 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2071 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2072 			thtail->window = th->window;
2073 		}
2074 
2075 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2076 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2077 		 * is not entered if we append a packet with a FIN.
2078 		 * SYN, RST, URG are not present.
2079 		 * ACK is set on both packets.
2080 		 * PSH : we do not really care in TCP stack,
2081 		 *       at least for 'GRO' packets.
2082 		 */
2083 		thtail->fin |= th->fin;
2084 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2085 
2086 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2087 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2088 			tail->tstamp = skb->tstamp;
2089 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2090 		}
2091 
2092 		/* Not as strict as GRO. We only need to carry the max mss value */
2093 		shinfo->gso_size = max(gso_size, tail_gso_size);
2094 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2095 
2096 		sk->sk_backlog.len += delta;
2097 		__NET_INC_STATS(sock_net(sk),
2098 				LINUX_MIB_TCPBACKLOGCOALESCE);
2099 		kfree_skb_partial(skb, fragstolen);
2100 		return false;
2101 	}
2102 	__skb_push(skb, hdrlen);
2103 
2104 no_coalesce:
2105 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2106 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2107 	 * sk_rcvbuf in normal conditions.
2108 	 * sk_rcvbuf under normal conditions.
2109 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2110 
2111 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2112 
2113 	/* Only the socket owner can try to collapse/prune rx queues
2114 	 * to reduce memory overhead, so add a little headroom here.
2115 	 * Only a few socket backlogs are likely to be non-empty at once.
2116 	 */
2117 	limit += 64 * 1024;
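	/* As an illustration, assuming the common defaults of
	 * sk_rcvbuf = 131072 and sk_sndbuf = 16384, the allowance is
	 * 262144 + 8192 + 65536 = 335872 bytes at this point.
	 */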
2118 
2119 	limit = min_t(u64, limit, UINT_MAX);
2120 
2121 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2122 		bh_unlock_sock(sk);
2123 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2124 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2125 		return true;
2126 	}
2127 	return false;
2128 }
2129 EXPORT_SYMBOL(tcp_add_backlog);
2130 
2131 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2132 {
2133 	struct tcphdr *th = (struct tcphdr *)skb->data;
2134 
2135 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2136 }
2137 EXPORT_SYMBOL(tcp_filter);
2138 
2139 static void tcp_v4_restore_cb(struct sk_buff *skb)
2140 {
2141 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2142 		sizeof(struct inet_skb_parm));
2143 }
2144 
2145 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2146 			   const struct tcphdr *th)
2147 {
2148 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2149 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2150 	 */
2151 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2152 		sizeof(struct inet_skb_parm));
2153 	barrier();
2154 
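	/* end_seq covers the payload plus one sequence number each for the
	 * SYN and FIN flags, since both consume sequence space.
	 */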
2155 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2156 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2157 				    skb->len - th->doff * 4);
2158 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2159 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2160 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2161 	TCP_SKB_CB(skb)->sacked	 = 0;
2162 	TCP_SKB_CB(skb)->has_rxtstamp =
2163 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2164 }
2165 
2166 /*
2167  *	From tcp_input.c
2168  */
2169 
2170 int tcp_v4_rcv(struct sk_buff *skb)
2171 {
2172 	struct net *net = dev_net(skb->dev);
2173 	enum skb_drop_reason drop_reason;
2174 	int sdif = inet_sdif(skb);
2175 	int dif = inet_iif(skb);
2176 	const struct iphdr *iph;
2177 	const struct tcphdr *th;
2178 	struct sock *sk = NULL;
2179 	bool refcounted;
2180 	int ret;
2181 	u32 isn;
2182 
2183 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2184 	if (skb->pkt_type != PACKET_HOST)
2185 		goto discard_it;
2186 
2187 	/* Count it even if it's bad */
2188 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2189 
2190 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2191 		goto discard_it;
2192 
2193 	th = (const struct tcphdr *)skb->data;
2194 
2195 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2196 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2197 		goto bad_packet;
2198 	}
2199 	if (!pskb_may_pull(skb, th->doff * 4))
2200 		goto discard_it;
2201 
2202 	/* An explanation is required here, I think.
2203 	 * Packet length and doff are validated by header prediction,
2204 	 * provided the case of th->doff==0 is eliminated.
2205 	 * So, we defer the checks. */
2206 
2207 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2208 		goto csum_error;
2209 
2210 	th = (const struct tcphdr *)skb->data;
2211 	iph = ip_hdr(skb);
2212 lookup:
2213 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2214 			       skb, __tcp_hdrlen(th), th->source,
2215 			       th->dest, sdif, &refcounted);
2216 	if (!sk)
2217 		goto no_tcp_socket;
2218 
2219 	if (sk->sk_state == TCP_TIME_WAIT)
2220 		goto do_time_wait;
2221 
2222 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
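		/* The lookup hit a request socket (SYN_RECV minisock).
		 * Validate the segment against its listener (xfrm policy,
		 * MD5/AO, checksum) before letting tcp_check_req() try to
		 * complete the handshake.
		 */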
2223 		struct request_sock *req = inet_reqsk(sk);
2224 		bool req_stolen = false;
2225 		struct sock *nsk;
2226 
2227 		sk = req->rsk_listener;
2228 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2229 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2230 		else
2231 			drop_reason = tcp_inbound_hash(sk, req, skb,
2232 						       &iph->saddr, &iph->daddr,
2233 						       AF_INET, dif, sdif);
2234 		if (unlikely(drop_reason)) {
2235 			sk_drops_add(sk, skb);
2236 			reqsk_put(req);
2237 			goto discard_it;
2238 		}
2239 		if (tcp_checksum_complete(skb)) {
2240 			reqsk_put(req);
2241 			goto csum_error;
2242 		}
2243 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2244 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2245 			if (!nsk) {
2246 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2247 				goto lookup;
2248 			}
2249 			sk = nsk;
2250 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2251 			 * reference before returning.
2252 			 */
2253 		} else {
2254 			/* We own a reference on the listener, increase it again
2255 			 * as we might lose it too soon.
2256 			 */
2257 			sock_hold(sk);
2258 		}
2259 		refcounted = true;
2260 		nsk = NULL;
2261 		if (!tcp_filter(sk, skb)) {
2262 			th = (const struct tcphdr *)skb->data;
2263 			iph = ip_hdr(skb);
2264 			tcp_v4_fill_cb(skb, iph, th);
2265 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2266 		} else {
2267 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2268 		}
2269 		if (!nsk) {
2270 			reqsk_put(req);
2271 			if (req_stolen) {
2272 				/* Another cpu got exclusive access to req
2273 				 * and created a full-blown socket.
2274 				 * Try to feed this packet to this socket
2275 				 * instead of discarding it.
2276 				 */
2277 				tcp_v4_restore_cb(skb);
2278 				sock_put(sk);
2279 				goto lookup;
2280 			}
2281 			goto discard_and_relse;
2282 		}
2283 		nf_reset_ct(skb);
2284 		if (nsk == sk) {
2285 			reqsk_put(req);
2286 			tcp_v4_restore_cb(skb);
2287 		} else {
2288 			drop_reason = tcp_child_process(sk, nsk, skb);
2289 			if (drop_reason) {
2290 				enum sk_rst_reason rst_reason;
2291 
2292 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2293 				tcp_v4_send_reset(nsk, skb, rst_reason);
2294 				goto discard_and_relse;
2295 			}
2296 			sock_put(sk);
2297 			return 0;
2298 		}
2299 	}
2300 
2301 process:
2302 	if (static_branch_unlikely(&ip4_min_ttl)) {
2303 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2304 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2305 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2306 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2307 			goto discard_and_relse;
2308 		}
2309 	}
2310 
2311 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2312 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2313 		goto discard_and_relse;
2314 	}
2315 
2316 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2317 				       AF_INET, dif, sdif);
2318 	if (drop_reason)
2319 		goto discard_and_relse;
2320 
2321 	nf_reset_ct(skb);
2322 
2323 	if (tcp_filter(sk, skb)) {
2324 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2325 		goto discard_and_relse;
2326 	}
2327 	th = (const struct tcphdr *)skb->data;
2328 	iph = ip_hdr(skb);
2329 	tcp_v4_fill_cb(skb, iph, th);
2330 
2331 	skb->dev = NULL;
2332 
2333 	if (sk->sk_state == TCP_LISTEN) {
2334 		ret = tcp_v4_do_rcv(sk, skb);
2335 		goto put_and_return;
2336 	}
2337 
2338 	sk_incoming_cpu_update(sk);
2339 
2340 	bh_lock_sock_nested(sk);
2341 	tcp_segs_in(tcp_sk(sk), skb);
2342 	ret = 0;
2343 	if (!sock_owned_by_user(sk)) {
2344 		ret = tcp_v4_do_rcv(sk, skb);
2345 	} else {
2346 		if (tcp_add_backlog(sk, skb, &drop_reason))
2347 			goto discard_and_relse;
2348 	}
2349 	bh_unlock_sock(sk);
2350 
2351 put_and_return:
2352 	if (refcounted)
2353 		sock_put(sk);
2354 
2355 	return ret;
2356 
2357 no_tcp_socket:
2358 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2359 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2360 		goto discard_it;
2361 
2362 	tcp_v4_fill_cb(skb, iph, th);
2363 
2364 	if (tcp_checksum_complete(skb)) {
2365 csum_error:
2366 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2367 		trace_tcp_bad_csum(skb);
2368 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2369 bad_packet:
2370 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2371 	} else {
2372 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2373 	}
2374 
2375 discard_it:
2376 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2377 	/* Discard frame. */
2378 	sk_skb_reason_drop(sk, skb, drop_reason);
2379 	return 0;
2380 
2381 discard_and_relse:
2382 	sk_drops_add(sk, skb);
2383 	if (refcounted)
2384 		sock_put(sk);
2385 	goto discard_it;
2386 
2387 do_time_wait:
2388 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2389 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2390 		inet_twsk_put(inet_twsk(sk));
2391 		goto discard_it;
2392 	}
2393 
2394 	tcp_v4_fill_cb(skb, iph, th);
2395 
2396 	if (tcp_checksum_complete(skb)) {
2397 		inet_twsk_put(inet_twsk(sk));
2398 		goto csum_error;
2399 	}
2400 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2401 	case TCP_TW_SYN: {
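		/* An acceptable new SYN hit a TIME_WAIT socket. If a matching
		 * listener exists, kill the timewait socket and process the
		 * SYN as a fresh connection request, carrying the ISN chosen
		 * by tcp_timewait_state_process() via per-cpu tcp_tw_isn.
		 */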
2402 		struct sock *sk2 = inet_lookup_listener(net,
2403 							net->ipv4.tcp_death_row.hashinfo,
2404 							skb, __tcp_hdrlen(th),
2405 							iph->saddr, th->source,
2406 							iph->daddr, th->dest,
2407 							inet_iif(skb),
2408 							sdif);
2409 		if (sk2) {
2410 			inet_twsk_deschedule_put(inet_twsk(sk));
2411 			sk = sk2;
2412 			tcp_v4_restore_cb(skb);
2413 			refcounted = false;
2414 			__this_cpu_write(tcp_tw_isn, isn);
2415 			goto process;
2416 		}
2417 	}
2418 		/* to ACK */
2419 		fallthrough;
2420 	case TCP_TW_ACK:
2421 		tcp_v4_timewait_ack(sk, skb);
2422 		break;
2423 	case TCP_TW_RST:
2424 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2425 		inet_twsk_deschedule_put(inet_twsk(sk));
2426 		goto discard_it;
2427 	case TCP_TW_SUCCESS:;
2428 	}
2429 	goto discard_it;
2430 }
2431 
2432 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2433 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2434 	.twsk_destructor = tcp_twsk_destructor,
2435 };
2436 
2437 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2438 {
2439 	struct dst_entry *dst = skb_dst(skb);
2440 
2441 	if (dst && dst_hold_safe(dst)) {
2442 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2443 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2444 	}
2445 }
2446 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2447 
2448 const struct inet_connection_sock_af_ops ipv4_specific = {
2449 	.queue_xmit	   = ip_queue_xmit,
2450 	.send_check	   = tcp_v4_send_check,
2451 	.rebuild_header	   = inet_sk_rebuild_header,
2452 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2453 	.conn_request	   = tcp_v4_conn_request,
2454 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2455 	.net_header_len	   = sizeof(struct iphdr),
2456 	.setsockopt	   = ip_setsockopt,
2457 	.getsockopt	   = ip_getsockopt,
2458 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2459 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2460 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2461 };
2462 EXPORT_SYMBOL(ipv4_specific);
2463 
2464 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2465 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2466 #ifdef CONFIG_TCP_MD5SIG
2467 	.md5_lookup		= tcp_v4_md5_lookup,
2468 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2469 	.md5_parse		= tcp_v4_parse_md5_keys,
2470 #endif
2471 #ifdef CONFIG_TCP_AO
2472 	.ao_lookup		= tcp_v4_ao_lookup,
2473 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2474 	.ao_parse		= tcp_v4_parse_ao,
2475 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2476 #endif
2477 };
2478 #endif
2479 
2480 /* NOTE: A lot of things are set to zero explicitly by the call to
2481  *       sk_alloc(), so they need not be done here.
2482  */
2483 static int tcp_v4_init_sock(struct sock *sk)
2484 {
2485 	struct inet_connection_sock *icsk = inet_csk(sk);
2486 
2487 	tcp_init_sock(sk);
2488 
2489 	icsk->icsk_af_ops = &ipv4_specific;
2490 
2491 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2492 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2493 #endif
2494 
2495 	return 0;
2496 }
2497 
2498 #ifdef CONFIG_TCP_MD5SIG
2499 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2500 {
2501 	struct tcp_md5sig_info *md5sig;
2502 
2503 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2504 	kfree(md5sig);
2505 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2506 	tcp_md5_release_sigpool();
2507 }
2508 #endif
2509 
2510 void tcp_v4_destroy_sock(struct sock *sk)
2511 {
2512 	struct tcp_sock *tp = tcp_sk(sk);
2513 
2514 	trace_tcp_destroy_sock(sk);
2515 
2516 	tcp_clear_xmit_timers(sk);
2517 
2518 	tcp_cleanup_congestion_control(sk);
2519 
2520 	tcp_cleanup_ulp(sk);
2521 
2522 	/* Clean up the write buffer. */
2523 	tcp_write_queue_purge(sk);
2524 
2525 	/* Check if we want to disable active TFO */
2526 	tcp_fastopen_active_disable_ofo_check(sk);
2527 
2528 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2529 	skb_rbtree_purge(&tp->out_of_order_queue);
2530 
2531 #ifdef CONFIG_TCP_MD5SIG
2532 	/* Clean up the MD5 key list, if any */
2533 	if (tp->md5sig_info) {
2534 		struct tcp_md5sig_info *md5sig;
2535 
2536 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2537 		tcp_clear_md5_list(sk);
2538 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2539 		rcu_assign_pointer(tp->md5sig_info, NULL);
2540 	}
2541 #endif
2542 	tcp_ao_destroy_sock(sk, false);
2543 
2544 	/* Clean up a referenced TCP bind bucket. */
2545 	if (inet_csk(sk)->icsk_bind_hash)
2546 		inet_put_port(sk);
2547 
2548 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2549 
2550 	/* If socket is aborted during connect operation */
2551 	tcp_free_fastopen_req(tp);
2552 	tcp_fastopen_destroy_cipher(sk);
2553 	tcp_saved_syn_free(tp);
2554 
2555 	sk_sockets_allocated_dec(sk);
2556 }
2557 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2558 
2559 #ifdef CONFIG_PROC_FS
2560 /* Proc filesystem TCP sock list dumping. */
2561 
2562 static unsigned short seq_file_family(const struct seq_file *seq);
2563 
2564 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2565 {
2566 	unsigned short family = seq_file_family(seq);
2567 
2568 	/* AF_UNSPEC is used as a match-all */
2569 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2570 		net_eq(sock_net(sk), seq_file_net(seq)));
2571 }
2572 
2573 /* Find a non-empty bucket (starting from st->bucket)
2574  * and return the first sk from it.
2575  */
2576 static void *listening_get_first(struct seq_file *seq)
2577 {
2578 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2579 	struct tcp_iter_state *st = seq->private;
2580 
2581 	st->offset = 0;
2582 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2583 		struct inet_listen_hashbucket *ilb2;
2584 		struct hlist_nulls_node *node;
2585 		struct sock *sk;
2586 
2587 		ilb2 = &hinfo->lhash2[st->bucket];
2588 		if (hlist_nulls_empty(&ilb2->nulls_head))
2589 			continue;
2590 
2591 		spin_lock(&ilb2->lock);
2592 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2593 			if (seq_sk_match(seq, sk))
2594 				return sk;
2595 		}
2596 		spin_unlock(&ilb2->lock);
2597 	}
2598 
2599 	return NULL;
2600 }
2601 
2602 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2603  * If "cur" is the last one in st->bucket,
2604  * call listening_get_first() to return the first sk of the next
2605  * non-empty bucket.
2606  */
2607 static void *listening_get_next(struct seq_file *seq, void *cur)
2608 {
2609 	struct tcp_iter_state *st = seq->private;
2610 	struct inet_listen_hashbucket *ilb2;
2611 	struct hlist_nulls_node *node;
2612 	struct inet_hashinfo *hinfo;
2613 	struct sock *sk = cur;
2614 
2615 	++st->num;
2616 	++st->offset;
2617 
2618 	sk = sk_nulls_next(sk);
2619 	sk_nulls_for_each_from(sk, node) {
2620 		if (seq_sk_match(seq, sk))
2621 			return sk;
2622 	}
2623 
2624 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2625 	ilb2 = &hinfo->lhash2[st->bucket];
2626 	spin_unlock(&ilb2->lock);
2627 	++st->bucket;
2628 	return listening_get_first(seq);
2629 }
2630 
2631 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2632 {
2633 	struct tcp_iter_state *st = seq->private;
2634 	void *rc;
2635 
2636 	st->bucket = 0;
2637 	st->offset = 0;
2638 	rc = listening_get_first(seq);
2639 
2640 	while (rc && *pos) {
2641 		rc = listening_get_next(seq, rc);
2642 		--*pos;
2643 	}
2644 	return rc;
2645 }
2646 
2647 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2648 				const struct tcp_iter_state *st)
2649 {
2650 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2651 }
2652 
2653 /*
2654  * Get first established socket starting from bucket given in st->bucket.
2655  * If st->bucket is zero, the very first socket in the hash is returned.
2656  */
2657 static void *established_get_first(struct seq_file *seq)
2658 {
2659 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2660 	struct tcp_iter_state *st = seq->private;
2661 
2662 	st->offset = 0;
2663 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2664 		struct sock *sk;
2665 		struct hlist_nulls_node *node;
2666 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2667 
2668 		cond_resched();
2669 
2670 		/* Lockless fast path for the common case of empty buckets */
2671 		if (empty_bucket(hinfo, st))
2672 			continue;
2673 
2674 		spin_lock_bh(lock);
2675 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2676 			if (seq_sk_match(seq, sk))
2677 				return sk;
2678 		}
2679 		spin_unlock_bh(lock);
2680 	}
2681 
2682 	return NULL;
2683 }
2684 
2685 static void *established_get_next(struct seq_file *seq, void *cur)
2686 {
2687 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2688 	struct tcp_iter_state *st = seq->private;
2689 	struct hlist_nulls_node *node;
2690 	struct sock *sk = cur;
2691 
2692 	++st->num;
2693 	++st->offset;
2694 
2695 	sk = sk_nulls_next(sk);
2696 
2697 	sk_nulls_for_each_from(sk, node) {
2698 		if (seq_sk_match(seq, sk))
2699 			return sk;
2700 	}
2701 
2702 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2703 	++st->bucket;
2704 	return established_get_first(seq);
2705 }
2706 
2707 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2708 {
2709 	struct tcp_iter_state *st = seq->private;
2710 	void *rc;
2711 
2712 	st->bucket = 0;
2713 	rc = established_get_first(seq);
2714 
2715 	while (rc && pos) {
2716 		rc = established_get_next(seq, rc);
2717 		--pos;
2718 	}
2719 	return rc;
2720 }
2721 
2722 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2723 {
2724 	void *rc;
2725 	struct tcp_iter_state *st = seq->private;
2726 
2727 	st->state = TCP_SEQ_STATE_LISTENING;
2728 	rc	  = listening_get_idx(seq, &pos);
2729 
2730 	if (!rc) {
2731 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2732 		rc	  = established_get_idx(seq, pos);
2733 	}
2734 
2735 	return rc;
2736 }
2737 
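/* Resume the iteration at the bucket/offset recorded by the previous pass,
 * so a partial read of the seq_file can continue without rescanning the
 * hash tables from the start.
 */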
2738 static void *tcp_seek_last_pos(struct seq_file *seq)
2739 {
2740 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2741 	struct tcp_iter_state *st = seq->private;
2742 	int bucket = st->bucket;
2743 	int offset = st->offset;
2744 	int orig_num = st->num;
2745 	void *rc = NULL;
2746 
2747 	switch (st->state) {
2748 	case TCP_SEQ_STATE_LISTENING:
2749 		if (st->bucket > hinfo->lhash2_mask)
2750 			break;
2751 		rc = listening_get_first(seq);
2752 		while (offset-- && rc && bucket == st->bucket)
2753 			rc = listening_get_next(seq, rc);
2754 		if (rc)
2755 			break;
2756 		st->bucket = 0;
2757 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2758 		fallthrough;
2759 	case TCP_SEQ_STATE_ESTABLISHED:
2760 		if (st->bucket > hinfo->ehash_mask)
2761 			break;
2762 		rc = established_get_first(seq);
2763 		while (offset-- && rc && bucket == st->bucket)
2764 			rc = established_get_next(seq, rc);
2765 	}
2766 
2767 	st->num = orig_num;
2768 
2769 	return rc;
2770 }
2771 
2772 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2773 {
2774 	struct tcp_iter_state *st = seq->private;
2775 	void *rc;
2776 
2777 	if (*pos && *pos == st->last_pos) {
2778 		rc = tcp_seek_last_pos(seq);
2779 		if (rc)
2780 			goto out;
2781 	}
2782 
2783 	st->state = TCP_SEQ_STATE_LISTENING;
2784 	st->num = 0;
2785 	st->bucket = 0;
2786 	st->offset = 0;
2787 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2788 
2789 out:
2790 	st->last_pos = *pos;
2791 	return rc;
2792 }
2793 EXPORT_SYMBOL(tcp_seq_start);
2794 
2795 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2796 {
2797 	struct tcp_iter_state *st = seq->private;
2798 	void *rc = NULL;
2799 
2800 	if (v == SEQ_START_TOKEN) {
2801 		rc = tcp_get_idx(seq, 0);
2802 		goto out;
2803 	}
2804 
2805 	switch (st->state) {
2806 	case TCP_SEQ_STATE_LISTENING:
2807 		rc = listening_get_next(seq, v);
2808 		if (!rc) {
2809 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2810 			st->bucket = 0;
2811 			st->offset = 0;
2812 			rc	  = established_get_first(seq);
2813 		}
2814 		break;
2815 	case TCP_SEQ_STATE_ESTABLISHED:
2816 		rc = established_get_next(seq, v);
2817 		break;
2818 	}
2819 out:
2820 	++*pos;
2821 	st->last_pos = *pos;
2822 	return rc;
2823 }
2824 EXPORT_SYMBOL(tcp_seq_next);
2825 
2826 void tcp_seq_stop(struct seq_file *seq, void *v)
2827 {
2828 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2829 	struct tcp_iter_state *st = seq->private;
2830 
2831 	switch (st->state) {
2832 	case TCP_SEQ_STATE_LISTENING:
2833 		if (v != SEQ_START_TOKEN)
2834 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2835 		break;
2836 	case TCP_SEQ_STATE_ESTABLISHED:
2837 		if (v)
2838 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2839 		break;
2840 	}
2841 }
2842 EXPORT_SYMBOL(tcp_seq_stop);
2843 
2844 static void get_openreq4(const struct request_sock *req,
2845 			 struct seq_file *f, int i)
2846 {
2847 	const struct inet_request_sock *ireq = inet_rsk(req);
2848 	long delta = req->rsk_timer.expires - jiffies;
2849 
2850 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2851 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2852 		i,
2853 		ireq->ir_loc_addr,
2854 		ireq->ir_num,
2855 		ireq->ir_rmt_addr,
2856 		ntohs(ireq->ir_rmt_port),
2857 		TCP_SYN_RECV,
2858 		0, 0, /* could print option size, but that is af dependent. */
2859 		1,    /* timers active (only the expire timer) */
2860 		jiffies_delta_to_clock_t(delta),
2861 		req->num_timeout,
2862 		from_kuid_munged(seq_user_ns(f),
2863 				 sock_i_uid(req->rsk_listener)),
2864 		0,  /* non-standard timer */
2865 		0, /* open_requests have no inode */
2866 		0,
2867 		req);
2868 }
2869 
2870 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2871 {
2872 	int timer_active;
2873 	unsigned long timer_expires;
2874 	const struct tcp_sock *tp = tcp_sk(sk);
2875 	const struct inet_connection_sock *icsk = inet_csk(sk);
2876 	const struct inet_sock *inet = inet_sk(sk);
2877 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2878 	__be32 dest = inet->inet_daddr;
2879 	__be32 src = inet->inet_rcv_saddr;
2880 	__u16 destp = ntohs(inet->inet_dport);
2881 	__u16 srcp = ntohs(inet->inet_sport);
2882 	int rx_queue;
2883 	int state;
2884 
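	/* Timer codes reported in the "tr" field of /proc/net/tcp:
	 * 1 retransmit/loss probe, 2 keepalive, 4 zero window probe,
	 * 0 no timer pending.
	 */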
2885 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2886 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2887 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2888 		timer_active	= 1;
2889 		timer_expires	= icsk->icsk_timeout;
2890 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2891 		timer_active	= 4;
2892 		timer_expires	= icsk->icsk_timeout;
2893 	} else if (timer_pending(&sk->sk_timer)) {
2894 		timer_active	= 2;
2895 		timer_expires	= sk->sk_timer.expires;
2896 	} else {
2897 		timer_active	= 0;
2898 		timer_expires = jiffies;
2899 	}
2900 
2901 	state = inet_sk_state_load(sk);
2902 	if (state == TCP_LISTEN)
2903 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2904 	else
2905 		/* Because we don't lock the socket,
2906 		 * we might find a transient negative value.
2907 		 */
2908 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2909 				      READ_ONCE(tp->copied_seq), 0);
2910 
2911 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2912 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2913 		i, src, srcp, dest, destp, state,
2914 		READ_ONCE(tp->write_seq) - tp->snd_una,
2915 		rx_queue,
2916 		timer_active,
2917 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2918 		icsk->icsk_retransmits,
2919 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2920 		icsk->icsk_probes_out,
2921 		sock_i_ino(sk),
2922 		refcount_read(&sk->sk_refcnt), sk,
2923 		jiffies_to_clock_t(icsk->icsk_rto),
2924 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2925 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2926 		tcp_snd_cwnd(tp),
2927 		state == TCP_LISTEN ?
2928 		    fastopenq->max_qlen :
2929 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2930 }
2931 
2932 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2933 			       struct seq_file *f, int i)
2934 {
2935 	long delta = tw->tw_timer.expires - jiffies;
2936 	__be32 dest, src;
2937 	__u16 destp, srcp;
2938 
2939 	dest  = tw->tw_daddr;
2940 	src   = tw->tw_rcv_saddr;
2941 	destp = ntohs(tw->tw_dport);
2942 	srcp  = ntohs(tw->tw_sport);
2943 
2944 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2945 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2946 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2947 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2948 		refcount_read(&tw->tw_refcnt), tw);
2949 }
2950 
2951 #define TMPSZ 150
2952 
2953 static int tcp4_seq_show(struct seq_file *seq, void *v)
2954 {
2955 	struct tcp_iter_state *st;
2956 	struct sock *sk = v;
2957 
2958 	seq_setwidth(seq, TMPSZ - 1);
2959 	if (v == SEQ_START_TOKEN) {
2960 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2961 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2962 			   "inode");
2963 		goto out;
2964 	}
2965 	st = seq->private;
2966 
2967 	if (sk->sk_state == TCP_TIME_WAIT)
2968 		get_timewait4_sock(v, seq, st->num);
2969 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2970 		get_openreq4(v, seq, st->num);
2971 	else
2972 		get_tcp4_sock(v, seq, st->num);
2973 out:
2974 	seq_pad(seq, '\n');
2975 	return 0;
2976 }
2977 
2978 #ifdef CONFIG_BPF_SYSCALL
2979 struct bpf_tcp_iter_state {
2980 	struct tcp_iter_state state;
2981 	unsigned int cur_sk;
2982 	unsigned int end_sk;
2983 	unsigned int max_sk;
2984 	struct sock **batch;
2985 	bool st_bucket_done;
2986 };
2987 
2988 struct bpf_iter__tcp {
2989 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2990 	__bpf_md_ptr(struct sock_common *, sk_common);
2991 	uid_t uid __aligned(8);
2992 };
2993 
2994 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2995 			     struct sock_common *sk_common, uid_t uid)
2996 {
2997 	struct bpf_iter__tcp ctx;
2998 
2999 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3000 	ctx.meta = meta;
3001 	ctx.sk_common = sk_common;
3002 	ctx.uid = uid;
3003 	return bpf_iter_run_prog(prog, &ctx);
3004 }
3005 
3006 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3007 {
3008 	while (iter->cur_sk < iter->end_sk)
3009 		sock_gen_put(iter->batch[iter->cur_sk++]);
3010 }
3011 
3012 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3013 				      unsigned int new_batch_sz)
3014 {
3015 	struct sock **new_batch;
3016 
3017 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3018 			     GFP_USER | __GFP_NOWARN);
3019 	if (!new_batch)
3020 		return -ENOMEM;
3021 
3022 	bpf_iter_tcp_put_batch(iter);
3023 	kvfree(iter->batch);
3024 	iter->batch = new_batch;
3025 	iter->max_sk = new_batch_sz;
3026 
3027 	return 0;
3028 }
3029 
3030 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3031 						 struct sock *start_sk)
3032 {
3033 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3034 	struct bpf_tcp_iter_state *iter = seq->private;
3035 	struct tcp_iter_state *st = &iter->state;
3036 	struct hlist_nulls_node *node;
3037 	unsigned int expected = 1;
3038 	struct sock *sk;
3039 
3040 	sock_hold(start_sk);
3041 	iter->batch[iter->end_sk++] = start_sk;
3042 
3043 	sk = sk_nulls_next(start_sk);
3044 	sk_nulls_for_each_from(sk, node) {
3045 		if (seq_sk_match(seq, sk)) {
3046 			if (iter->end_sk < iter->max_sk) {
3047 				sock_hold(sk);
3048 				iter->batch[iter->end_sk++] = sk;
3049 			}
3050 			expected++;
3051 		}
3052 	}
3053 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3054 
3055 	return expected;
3056 }
3057 
3058 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3059 						   struct sock *start_sk)
3060 {
3061 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3062 	struct bpf_tcp_iter_state *iter = seq->private;
3063 	struct tcp_iter_state *st = &iter->state;
3064 	struct hlist_nulls_node *node;
3065 	unsigned int expected = 1;
3066 	struct sock *sk;
3067 
3068 	sock_hold(start_sk);
3069 	iter->batch[iter->end_sk++] = start_sk;
3070 
3071 	sk = sk_nulls_next(start_sk);
3072 	sk_nulls_for_each_from(sk, node) {
3073 		if (seq_sk_match(seq, sk)) {
3074 			if (iter->end_sk < iter->max_sk) {
3075 				sock_hold(sk);
3076 				iter->batch[iter->end_sk++] = sk;
3077 			}
3078 			expected++;
3079 		}
3080 	}
3081 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3082 
3083 	return expected;
3084 }
3085 
3086 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3087 {
3088 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3089 	struct bpf_tcp_iter_state *iter = seq->private;
3090 	struct tcp_iter_state *st = &iter->state;
3091 	unsigned int expected;
3092 	bool resized = false;
3093 	struct sock *sk;
3094 
3095 	/* The st->bucket is done.  Directly advance to the next
3096 	 * bucket instead of having tcp_seek_last_pos() skip sockets
3097 	 * one by one in the current bucket, only to eventually find
3098 	 * out that it has to advance to the next bucket.
3099 	 */
3100 	if (iter->st_bucket_done) {
3101 		st->offset = 0;
3102 		st->bucket++;
3103 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3104 		    st->bucket > hinfo->lhash2_mask) {
3105 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3106 			st->bucket = 0;
3107 		}
3108 	}
3109 
3110 again:
3111 	/* Get a new batch */
3112 	iter->cur_sk = 0;
3113 	iter->end_sk = 0;
3114 	iter->st_bucket_done = false;
3115 
3116 	sk = tcp_seek_last_pos(seq);
3117 	if (!sk)
3118 		return NULL; /* Done */
3119 
3120 	if (st->state == TCP_SEQ_STATE_LISTENING)
3121 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3122 	else
3123 		expected = bpf_iter_tcp_established_batch(seq, sk);
3124 
3125 	if (iter->end_sk == expected) {
3126 		iter->st_bucket_done = true;
3127 		return sk;
3128 	}
3129 
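	/* The bucket held more sockets than the batch could take: grow the
	 * batch with 50% slack and rescan the bucket once. If it still does
	 * not fit, proceed with the partial batch collected above.
	 */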
3130 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3131 		resized = true;
3132 		goto again;
3133 	}
3134 
3135 	return sk;
3136 }
3137 
3138 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3139 {
3140 	/* bpf iter does not support lseek, so it always
3141 	 * continues from where it was stop()-ped.
3142 	 */
3143 	if (*pos)
3144 		return bpf_iter_tcp_batch(seq);
3145 
3146 	return SEQ_START_TOKEN;
3147 }
3148 
3149 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3150 {
3151 	struct bpf_tcp_iter_state *iter = seq->private;
3152 	struct tcp_iter_state *st = &iter->state;
3153 	struct sock *sk;
3154 
3155 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3156 	 * done with seq_show(), so advance to the next sk in
3157 	 * the batch.
3158 	 */
3159 	if (iter->cur_sk < iter->end_sk) {
3160 		/* Keeping st->num consistent in tcp_iter_state.
3161 		 * bpf_iter_tcp does not use st->num.
3162 		 * meta.seq_num is used instead.
3163 		 */
3164 		st->num++;
3165 		/* Move st->offset to the next sk in the bucket such that
3166 		 * the future start() will resume at st->offset in
3167 		 * st->bucket.  See tcp_seek_last_pos().
3168 		 */
3169 		st->offset++;
3170 		sock_gen_put(iter->batch[iter->cur_sk++]);
3171 	}
3172 
3173 	if (iter->cur_sk < iter->end_sk)
3174 		sk = iter->batch[iter->cur_sk];
3175 	else
3176 		sk = bpf_iter_tcp_batch(seq);
3177 
3178 	++*pos;
3179 	/* Keeping st->last_pos consistent in tcp_iter_state.
3180 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3181 	 */
3182 	st->last_pos = *pos;
3183 	return sk;
3184 }
3185 
3186 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3187 {
3188 	struct bpf_iter_meta meta;
3189 	struct bpf_prog *prog;
3190 	struct sock *sk = v;
3191 	uid_t uid;
3192 	int ret;
3193 
3194 	if (v == SEQ_START_TOKEN)
3195 		return 0;
3196 
3197 	if (sk_fullsock(sk))
3198 		lock_sock(sk);
3199 
3200 	if (unlikely(sk_unhashed(sk))) {
3201 		ret = SEQ_SKIP;
3202 		goto unlock;
3203 	}
3204 
3205 	if (sk->sk_state == TCP_TIME_WAIT) {
3206 		uid = 0;
3207 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3208 		const struct request_sock *req = v;
3209 
3210 		uid = from_kuid_munged(seq_user_ns(seq),
3211 				       sock_i_uid(req->rsk_listener));
3212 	} else {
3213 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3214 	}
3215 
3216 	meta.seq = seq;
3217 	prog = bpf_iter_get_info(&meta, false);
3218 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3219 
3220 unlock:
3221 	if (sk_fullsock(sk))
3222 		release_sock(sk);
3223 	return ret;
3224 
3225 }
3226 
3227 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3228 {
3229 	struct bpf_tcp_iter_state *iter = seq->private;
3230 	struct bpf_iter_meta meta;
3231 	struct bpf_prog *prog;
3232 
3233 	if (!v) {
3234 		meta.seq = seq;
3235 		prog = bpf_iter_get_info(&meta, true);
3236 		if (prog)
3237 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3238 	}
3239 
3240 	if (iter->cur_sk < iter->end_sk) {
3241 		bpf_iter_tcp_put_batch(iter);
3242 		iter->st_bucket_done = false;
3243 	}
3244 }
3245 
3246 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3247 	.show		= bpf_iter_tcp_seq_show,
3248 	.start		= bpf_iter_tcp_seq_start,
3249 	.next		= bpf_iter_tcp_seq_next,
3250 	.stop		= bpf_iter_tcp_seq_stop,
3251 };
3252 #endif
3253 static unsigned short seq_file_family(const struct seq_file *seq)
3254 {
3255 	const struct tcp_seq_afinfo *afinfo;
3256 
3257 #ifdef CONFIG_BPF_SYSCALL
3258 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3259 	if (seq->op == &bpf_iter_tcp_seq_ops)
3260 		return AF_UNSPEC;
3261 #endif
3262 
3263 	/* Iterated from proc fs */
3264 	afinfo = pde_data(file_inode(seq->file));
3265 	return afinfo->family;
3266 }
3267 
3268 static const struct seq_operations tcp4_seq_ops = {
3269 	.show		= tcp4_seq_show,
3270 	.start		= tcp_seq_start,
3271 	.next		= tcp_seq_next,
3272 	.stop		= tcp_seq_stop,
3273 };
3274 
3275 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3276 	.family		= AF_INET,
3277 };
3278 
3279 static int __net_init tcp4_proc_init_net(struct net *net)
3280 {
3281 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3282 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3283 		return -ENOMEM;
3284 	return 0;
3285 }
3286 
3287 static void __net_exit tcp4_proc_exit_net(struct net *net)
3288 {
3289 	remove_proc_entry("tcp", net->proc_net);
3290 }
3291 
3292 static struct pernet_operations tcp4_net_ops = {
3293 	.init = tcp4_proc_init_net,
3294 	.exit = tcp4_proc_exit_net,
3295 };
3296 
3297 int __init tcp4_proc_init(void)
3298 {
3299 	return register_pernet_subsys(&tcp4_net_ops);
3300 }
3301 
3302 void tcp4_proc_exit(void)
3303 {
3304 	unregister_pernet_subsys(&tcp4_net_ops);
3305 }
3306 #endif /* CONFIG_PROC_FS */
3307 
3308 /* @wake is one when sk_stream_write_space() calls us.
3309  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3310  * This mimics the strategy used in sock_def_write_space().
3311  */
3312 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3313 {
3314 	const struct tcp_sock *tp = tcp_sk(sk);
3315 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3316 			    READ_ONCE(tp->snd_nxt);
3317 
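	/* With @wake set, the shift halves the effective threshold, so
	 * EPOLLOUT is signalled only once notsent_bytes drops below half
	 * of tcp_notsent_lowat().
	 */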
3318 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3319 }
3320 EXPORT_SYMBOL(tcp_stream_memory_free);
3321 
3322 struct proto tcp_prot = {
3323 	.name			= "TCP",
3324 	.owner			= THIS_MODULE,
3325 	.close			= tcp_close,
3326 	.pre_connect		= tcp_v4_pre_connect,
3327 	.connect		= tcp_v4_connect,
3328 	.disconnect		= tcp_disconnect,
3329 	.accept			= inet_csk_accept,
3330 	.ioctl			= tcp_ioctl,
3331 	.init			= tcp_v4_init_sock,
3332 	.destroy		= tcp_v4_destroy_sock,
3333 	.shutdown		= tcp_shutdown,
3334 	.setsockopt		= tcp_setsockopt,
3335 	.getsockopt		= tcp_getsockopt,
3336 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3337 	.keepalive		= tcp_set_keepalive,
3338 	.recvmsg		= tcp_recvmsg,
3339 	.sendmsg		= tcp_sendmsg,
3340 	.splice_eof		= tcp_splice_eof,
3341 	.backlog_rcv		= tcp_v4_do_rcv,
3342 	.release_cb		= tcp_release_cb,
3343 	.hash			= inet_hash,
3344 	.unhash			= inet_unhash,
3345 	.get_port		= inet_csk_get_port,
3346 	.put_port		= inet_put_port,
3347 #ifdef CONFIG_BPF_SYSCALL
3348 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3349 #endif
3350 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3351 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3352 	.stream_memory_free	= tcp_stream_memory_free,
3353 	.sockets_allocated	= &tcp_sockets_allocated,
3354 	.orphan_count		= &tcp_orphan_count,
3355 
3356 	.memory_allocated	= &tcp_memory_allocated,
3357 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3358 
3359 	.memory_pressure	= &tcp_memory_pressure,
3360 	.sysctl_mem		= sysctl_tcp_mem,
3361 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3362 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3363 	.max_header		= MAX_TCP_HEADER,
3364 	.obj_size		= sizeof(struct tcp_sock),
3365 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3366 	.twsk_prot		= &tcp_timewait_sock_ops,
3367 	.rsk_prot		= &tcp_request_sock_ops,
3368 	.h.hashinfo		= NULL,
3369 	.no_autobind		= true,
3370 	.diag_destroy		= tcp_abort,
3371 };
3372 EXPORT_SYMBOL(tcp_prot);
3373 
3374 static void __net_exit tcp_sk_exit(struct net *net)
3375 {
3376 	if (net->ipv4.tcp_congestion_control)
3377 		bpf_module_put(net->ipv4.tcp_congestion_control,
3378 			       net->ipv4.tcp_congestion_control->owner);
3379 }
3380 
3381 static void __net_init tcp_set_hashinfo(struct net *net)
3382 {
3383 	struct inet_hashinfo *hinfo;
3384 	unsigned int ehash_entries;
3385 	struct net *old_net;
3386 
3387 	if (net_eq(net, &init_net))
3388 		goto fallback;
3389 
3390 	old_net = current->nsproxy->net_ns;
3391 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3392 	if (!ehash_entries)
3393 		goto fallback;
3394 
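	/* The child netns gets its own ehash sized from the parent's
	 * tcp_child_ehash_entries, rounded up to a power of two: e.g. a
	 * request for 1000 entries yields a 1024-slot table.
	 */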
3395 	ehash_entries = roundup_pow_of_two(ehash_entries);
3396 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3397 	if (!hinfo) {
3398 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3399 			"for a netns, fallback to the global one\n",
3400 			ehash_entries);
3401 fallback:
3402 		hinfo = &tcp_hashinfo;
3403 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3404 	}
3405 
3406 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3407 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3408 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3409 }
3410 
3411 static int __net_init tcp_sk_init(struct net *net)
3412 {
3413 	net->ipv4.sysctl_tcp_ecn = 2;
3414 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3415 
3416 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3417 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3418 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3419 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3420 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3421 
3422 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3423 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3424 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3425 
3426 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3427 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3428 	net->ipv4.sysctl_tcp_syncookies = 1;
3429 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3430 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3431 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3432 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3433 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3434 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3435 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3436 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3437 
3438 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3439 	tcp_set_hashinfo(net);
3440 
3441 	net->ipv4.sysctl_tcp_sack = 1;
3442 	net->ipv4.sysctl_tcp_window_scaling = 1;
3443 	net->ipv4.sysctl_tcp_timestamps = 1;
3444 	net->ipv4.sysctl_tcp_early_retrans = 3;
3445 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3446 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3447 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3448 	net->ipv4.sysctl_tcp_max_reordering = 300;
3449 	net->ipv4.sysctl_tcp_dsack = 1;
3450 	net->ipv4.sysctl_tcp_app_win = 31;
3451 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3452 	net->ipv4.sysctl_tcp_frto = 2;
3453 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3454 	/* This limits the percentage of the congestion window which we
3455 	 * will allow a single TSO frame to consume.  Building TSO frames
3456 	 * which are too large can cause TCP streams to be bursty.
3457 	 */
3458 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3459 	/* Default TSQ limit of 16 TSO segments */
3460 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3461 
3462 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3463 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3464 
3465 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3466 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3467 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3468 	net->ipv4.sysctl_tcp_autocorking = 1;
3469 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3470 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3471 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3472 	if (net != &init_net) {
3473 		memcpy(net->ipv4.sysctl_tcp_rmem,
3474 		       init_net.ipv4.sysctl_tcp_rmem,
3475 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3476 		memcpy(net->ipv4.sysctl_tcp_wmem,
3477 		       init_net.ipv4.sysctl_tcp_wmem,
3478 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3479 	}
3480 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3481 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3482 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3483 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3484 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3485 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3486 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3487 
3488 	/* Set default values for PLB */
3489 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3490 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3491 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3492 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3493 	/* Default congestion threshold for PLB to mark a round is 50% */
3494 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3495 
3496 	/* Reno is always built in */
3497 	if (!net_eq(net, &init_net) &&
3498 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3499 			       init_net.ipv4.tcp_congestion_control->owner))
3500 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3501 	else
3502 		net->ipv4.tcp_congestion_control = &tcp_reno;
3503 
3504 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3505 	net->ipv4.sysctl_tcp_shrink_window = 0;
3506 
3507 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3508 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3509 
3510 	return 0;
3511 }
3512 
3513 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3514 {
3515 	struct net *net;
3516 
3517 	tcp_twsk_purge(net_exit_list);
3518 
3519 	list_for_each_entry(net, net_exit_list, exit_list) {
3520 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3521 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3522 		tcp_fastopen_ctx_destroy(net);
3523 	}
3524 }
3525 
3526 static struct pernet_operations __net_initdata tcp_sk_ops = {
3527 	.init	   = tcp_sk_init,
3528 	.exit	   = tcp_sk_exit,
3529 	.exit_batch = tcp_sk_exit_batch,
3530 };
3531 
3532 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3533 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3534 		     struct sock_common *sk_common, uid_t uid)
3535 
3536 #define INIT_BATCH_SZ 16
3537 
3538 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3539 {
3540 	struct bpf_tcp_iter_state *iter = priv_data;
3541 	int err;
3542 
3543 	err = bpf_iter_init_seq_net(priv_data, aux);
3544 	if (err)
3545 		return err;
3546 
3547 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3548 	if (err) {
3549 		bpf_iter_fini_seq_net(priv_data);
3550 		return err;
3551 	}
3552 
3553 	return 0;
3554 }
3555 
3556 static void bpf_iter_fini_tcp(void *priv_data)
3557 {
3558 	struct bpf_tcp_iter_state *iter = priv_data;
3559 
3560 	bpf_iter_fini_seq_net(priv_data);
3561 	kvfree(iter->batch);
3562 }
3563 
3564 static const struct bpf_iter_seq_info tcp_seq_info = {
3565 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3566 	.init_seq_private	= bpf_iter_init_tcp,
3567 	.fini_seq_private	= bpf_iter_fini_tcp,
3568 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3569 };
3570 
3571 static const struct bpf_func_proto *
3572 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3573 			    const struct bpf_prog *prog)
3574 {
3575 	switch (func_id) {
3576 	case BPF_FUNC_setsockopt:
3577 		return &bpf_sk_setsockopt_proto;
3578 	case BPF_FUNC_getsockopt:
3579 		return &bpf_sk_getsockopt_proto;
3580 	default:
3581 		return NULL;
3582 	}
3583 }
3584 
3585 static struct bpf_iter_reg tcp_reg_info = {
3586 	.target			= "tcp",
3587 	.ctx_arg_info_size	= 1,
3588 	.ctx_arg_info		= {
3589 		{ offsetof(struct bpf_iter__tcp, sk_common),
3590 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3591 	},
3592 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3593 	.seq_info		= &tcp_seq_info,
3594 };
3595 
3596 static void __init bpf_iter_register(void)
3597 {
3598 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3599 	if (bpf_iter_reg_target(&tcp_reg_info))
3600 		pr_warn("Warning: could not register bpf iterator tcp\n");
3601 }
3602 
3603 #endif
3604 
3605 void __init tcp_v4_init(void)
3606 {
3607 	int cpu, res;
3608 
3609 	for_each_possible_cpu(cpu) {
3610 		struct sock *sk;
3611 
3612 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3613 					   IPPROTO_TCP, &init_net);
3614 		if (res)
3615 			panic("Failed to create the TCP control socket.\n");
3616 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3617 
3618 		/* Please enforce IP_DF and IPID==0 for RST and
3619 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3620 		 */
3621 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3622 
3623 		sk->sk_clockid = CLOCK_MONOTONIC;
3624 
3625 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3626 	}
3627 	if (register_pernet_subsys(&tcp_sk_ops))
3628 		panic("Failed to create the TCP control socket.\n");
3629 
3630 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3631 	bpf_iter_register();
3632 #endif
3633 }
3634