xref: /linux/net/ipv4/tcp_ipv4.c (revision 712676ea2bb3882a852bcf49862c4247317fc9b2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 
83 #include <crypto/hash.h>
84 #include <linux/scatterlist.h>
85 
86 #include <trace/events/tcp.h>
87 
/* Forward declaration: tcp_v4_send_reset() and tcp_v4_send_ack() further
 * down call this helper before its definition appears.  It is static, so
 * the definition lives in this translation unit.  Only declared when
 * TCP-MD5 signature support is configured.
 */
88 #ifdef CONFIG_TCP_MD5SIG
89 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
90 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
91 #endif
92
/* Exported inet hash-table descriptor for TCP.
 * NOTE(review): the lookups visible in this part of the file reach the
 * tables through net->ipv4.tcp_death_row.hashinfo rather than naming this
 * object directly.
 */
93 struct inet_hashinfo tcp_hashinfo;
94 EXPORT_SYMBOL(tcp_hashinfo);
95
/* Per-CPU control-socket slot.  tcp_v4_send_reset() and tcp_v4_send_ack()
 * take .bh_lock with local_lock_nested_bh() and then read .sock to obtain
 * the socket they hand to ip_send_unicast_reply().
 */
96 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
97 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
98 };
99
/* NOTE(review): not referenced in the part of the file reviewed here;
 * presumably serializes batched netns exit work - confirm at its users.
 */
100 static DEFINE_MUTEX(tcp_exit_batch_mutex);
101 
102 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
103 {
104 	return secure_tcp_seq(ip_hdr(skb)->daddr,
105 			      ip_hdr(skb)->saddr,
106 			      tcp_hdr(skb)->dest,
107 			      tcp_hdr(skb)->source);
108 }
109 
110 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
111 {
112 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
113 }
114 
115 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
116 {
117 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
118 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
119 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
120 	struct tcp_sock *tp = tcp_sk(sk);
121 	int ts_recent_stamp;
122 
123 	if (reuse == 2) {
124 		/* Still does not detect *everything* that goes through
125 		 * lo, since we require a loopback src or dst address
126 		 * or direct binding to 'lo' interface.
127 		 */
128 		bool loopback = false;
129 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
130 			loopback = true;
131 #if IS_ENABLED(CONFIG_IPV6)
132 		if (tw->tw_family == AF_INET6) {
133 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
134 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
135 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
136 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
137 				loopback = true;
138 		} else
139 #endif
140 		{
141 			if (ipv4_is_loopback(tw->tw_daddr) ||
142 			    ipv4_is_loopback(tw->tw_rcv_saddr))
143 				loopback = true;
144 		}
145 		if (!loopback)
146 			reuse = 0;
147 	}
148 
149 	/* With PAWS, it is safe from the viewpoint
150 	   of data integrity. Even without PAWS it is safe provided sequence
151 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
152 
153 	   Actually, the idea is close to VJ's one, only timestamp cache is
154 	   held not per host, but per port pair and TW bucket is used as state
155 	   holder.
156 
157 	   If TW bucket has been already destroyed we fall back to VJ's scheme
158 	   and use initial timestamp retrieved from peer table.
159 	 */
160 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
161 	if (ts_recent_stamp &&
162 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
163 					    ts_recent_stamp)))) {
164 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
165 		 * and releasing the bucket lock.
166 		 */
167 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
168 			return 0;
169 
170 		/* In case of repair and re-using TIME-WAIT sockets we still
171 		 * want to be sure that it is safe as above but honor the
172 		 * sequence numbers and time stamps set as part of the repair
173 		 * process.
174 		 *
175 		 * Without this check re-using a TIME-WAIT socket with TCP
176 		 * repair would accumulate a -1 on the repair assigned
177 		 * sequence number. The first time it is reused the sequence
178 		 * is -1, the second time -2, etc. This fixes that issue
179 		 * without appearing to create any others.
180 		 */
181 		if (likely(!tp->repair)) {
182 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
183 
184 			if (!seq)
185 				seq = 1;
186 			WRITE_ONCE(tp->write_seq, seq);
187 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
188 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
189 		}
190 
191 		return 1;
192 	}
193 
194 	return 0;
195 }
196 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
197 
198 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
199 			      int addr_len)
200 {
201 	/* This check is replicated from tcp_v4_connect() and intended to
202 	 * prevent BPF program called below from accessing bytes that are out
203 	 * of the bound specified by user in addr_len.
204 	 */
205 	if (addr_len < sizeof(struct sockaddr_in))
206 		return -EINVAL;
207 
208 	sock_owned_by_me(sk);
209 
210 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
211 }
212 
213 /* This will initiate an outgoing connection. */
/*
 * tcp_v4_connect - start an active IPv4 TCP open on @sk
 * @sk:       socket to connect; the inet_opt access below is annotated with
 *            lockdep_sock_is_held(), i.e. the socket lock is expected to be
 *            held by the caller
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: number of valid bytes in @uaddr
 *
 * Returns 0 once tcp_connect() has succeeded, the value left in err by
 * tcp_fastopen_defer_connect() when that helper returns true, or a negative
 * errno.  Failures that happen after the switch to TCP_SYN_SENT go through
 * the "failure" label, which puts the socket back into TCP_CLOSE.
 */
214 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
215 {
216 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
217 	struct inet_timewait_death_row *tcp_death_row;
218 	struct inet_sock *inet = inet_sk(sk);
219 	struct tcp_sock *tp = tcp_sk(sk);
220 	struct ip_options_rcu *inet_opt;
221 	struct net *net = sock_net(sk);
222 	__be16 orig_sport, orig_dport;
223 	__be32 daddr, nexthop;
224 	struct flowi4 *fl4;
225 	struct rtable *rt;
226 	int err;
227
228 	if (addr_len < sizeof(struct sockaddr_in))
229 		return -EINVAL;
230
231 	if (usin->sin_family != AF_INET)
232 		return -EAFNOSUPPORT;
233
	/* With a source-route option the route lookup targets the option's
	 * faddr, while daddr keeps the address supplied by the user.
	 */
234 	nexthop = daddr = usin->sin_addr.s_addr;
235 	inet_opt = rcu_dereference_protected(inet->inet_opt,
236 					     lockdep_sock_is_held(sk));
237 	if (inet_opt && inet_opt->opt.srr) {
238 		if (!daddr)
239 			return -EINVAL;
240 		nexthop = inet_opt->opt.faddr;
241 	}
242
	/* The ports are remembered so that ip_route_newports() can later
	 * compare them with the ports the socket ends up with.  The flow
	 * lives in the socket's cork area.
	 */
243 	orig_sport = inet->inet_sport;
244 	orig_dport = usin->sin_port;
245 	fl4 = &inet->cork.fl.u.ip4;
246 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
247 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
248 			      orig_dport, sk);
249 	if (IS_ERR(rt)) {
250 		err = PTR_ERR(rt);
251 		if (err == -ENETUNREACH)
252 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
253 		return err;
254 	}
255
	/* Routes flagged multicast or broadcast are refused. */
256 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
257 		ip_rt_put(rt);
258 		return -ENETUNREACH;
259 	}
260
	/* Without source routing, use the destination found in the flow
	 * after ip_route_connect().
	 */
261 	if (!inet_opt || !inet_opt->opt.srr)
262 		daddr = fl4->daddr;
263
264 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
265
	/* No source address on the socket yet: install the one from the
	 * flow.  Otherwise just mirror inet_saddr into the receive address.
	 */
266 	if (!inet->inet_saddr) {
267 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
268 		if (err) {
269 			ip_rt_put(rt);
270 			return err;
271 		}
272 	} else {
273 		sk_rcv_saddr_set(sk, inet->inet_saddr);
274 	}
275
	/* Timestamp state recorded for a different destination must not be
	 * carried over; write_seq is cleared too unless in repair mode.
	 */
276 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
277 		/* Reset inherited state */
278 		tp->rx_opt.ts_recent	   = 0;
279 		tp->rx_opt.ts_recent_stamp = 0;
280 		if (likely(!tp->repair))
281 			WRITE_ONCE(tp->write_seq, 0);
282 	}
283
284 	inet->inet_dport = usin->sin_port;
285 	sk_daddr_set(sk, daddr);
286
	/* Extension header length is the IP option length, or 0 if none. */
287 	inet_csk(sk)->icsk_ext_hdr_len = 0;
288 	if (inet_opt)
289 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
290
291 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
292
293 	/* Socket identity is still unknown (sport may be zero).
294 	 * However we set state to SYN-SENT and not releasing socket
295 	 * lock select source port, enter ourselves into the hash tables and
296 	 * complete initialization after this.
297 	 */
298 	tcp_set_state(sk, TCP_SYN_SENT);
299 	err = inet_hash_connect(tcp_death_row, sk);
300 	if (err)
301 		goto failure;
302
303 	sk_set_txhash(sk);
304
	/* Re-evaluate the route with the ports the socket now has.  On error
	 * rt is set to NULL before jumping, so the failure path calls
	 * ip_rt_put() with NULL.
	 */
305 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
306 			       inet->inet_sport, inet->inet_dport, sk);
307 	if (IS_ERR(rt)) {
308 		err = PTR_ERR(rt);
309 		rt = NULL;
310 		goto failure;
311 	}
312 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
313 	/* OK, now commit destination to socket.  */
314 	sk->sk_gso_type = SKB_GSO_TCPV4;
315 	sk_setup_caps(sk, &rt->dst);
	/* rt is cleared so the failure path does not put this route again
	 * (presumably sk_setup_caps() handed the reference to the socket -
	 * confirm).
	 */
316 	rt = NULL;
317
	/* Outside repair mode: choose an initial write_seq unless a non-zero
	 * one is already present, and always refresh the timestamp offset.
	 */
318 	if (likely(!tp->repair)) {
319 		if (!tp->write_seq)
320 			WRITE_ONCE(tp->write_seq,
321 				   secure_tcp_seq(inet->inet_saddr,
322 						  inet->inet_daddr,
323 						  inet->inet_sport,
324 						  usin->sin_port));
325 		WRITE_ONCE(tp->tsoffset,
326 			   secure_tcp_ts_off(net, inet->inet_saddr,
327 					     inet->inet_daddr));
328 	}
329
330 	atomic_set(&inet->inet_id, get_random_u16());
331
	/* When the helper returns true, err is returned as is and the
	 * failure cleanup is skipped; a non-zero err with a false return is
	 * treated as a failure.
	 */
332 	if (tcp_fastopen_defer_connect(sk, &err))
333 		return err;
334 	if (err)
335 		goto failure;
336
337 	err = tcp_connect(sk);
338
339 	if (err)
340 		goto failure;
341
342 	return 0;
343
344 failure:
345 	/*
346 	 * This unhashes the socket and releases the local port,
347 	 * if necessary.
348 	 */
349 	tcp_set_state(sk, TCP_CLOSE);
350 	inet_bhash2_reset_saddr(sk);
351 	ip_rt_put(rt);
352 	sk->sk_route_caps = 0;
353 	inet->inet_dport = 0;
354 	return err;
355 }
356 EXPORT_SYMBOL(tcp_v4_connect);
357 
358 /*
359  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
360  * It can be called through tcp_release_cb() if socket was owned by user
361  * at the time tcp_v4_err() was called to handle ICMP message.
362  */
363 void tcp_v4_mtu_reduced(struct sock *sk)
364 {
365 	struct inet_sock *inet = inet_sk(sk);
366 	struct dst_entry *dst;
367 	u32 mtu;
368 
369 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
370 		return;
371 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
372 	dst = inet_csk_update_pmtu(sk, mtu);
373 	if (!dst)
374 		return;
375 
376 	/* Something is about to be wrong... Remember soft error
377 	 * for the case, if this connection will not able to recover.
378 	 */
379 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
380 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
381 
382 	mtu = dst_mtu(dst);
383 
384 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
385 	    ip_sk_accept_pmtu(sk) &&
386 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
387 		tcp_sync_mss(sk, mtu);
388 
389 		/* Resend the TCP packet because it's
390 		 * clear that the old packet has been
391 		 * dropped. This is the new "fast" path mtu
392 		 * discovery.
393 		 */
394 		tcp_simple_retransmit(sk);
395 	} /* else let the usual retransmit timer handle it */
396 }
397 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
398 
399 static void do_redirect(struct sk_buff *skb, struct sock *sk)
400 {
401 	struct dst_entry *dst = __sk_dst_check(sk, 0);
402 
403 	if (dst)
404 		dst->ops->redirect(dst, sk, skb);
405 }
406 
407 
408 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
409 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
410 {
411 	struct request_sock *req = inet_reqsk(sk);
412 	struct net *net = sock_net(sk);
413 
414 	/* ICMPs are not backlogged, hence we cannot get
415 	 * an established socket here.
416 	 */
417 	if (seq != tcp_rsk(req)->snt_isn) {
418 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
419 	} else if (abort) {
420 		/*
421 		 * Still in SYN_RECV, just remove it silently.
422 		 * There is no good way to pass the error to the newly
423 		 * created socket, and POSIX does not want network
424 		 * errors returned from accept().
425 		 */
426 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
427 		tcp_listendrop(req->rsk_listener);
428 	}
429 	reqsk_put(req);
430 }
431 EXPORT_SYMBOL(tcp_req_err);
432 
433 /* TCP-LD (RFC 6069) logic */
434 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
435 {
436 	struct inet_connection_sock *icsk = inet_csk(sk);
437 	struct tcp_sock *tp = tcp_sk(sk);
438 	struct sk_buff *skb;
439 	s32 remaining;
440 	u32 delta_us;
441 
442 	if (sock_owned_by_user(sk))
443 		return;
444 
445 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
446 	    !icsk->icsk_backoff)
447 		return;
448 
449 	skb = tcp_rtx_queue_head(sk);
450 	if (WARN_ON_ONCE(!skb))
451 		return;
452 
453 	icsk->icsk_backoff--;
454 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
455 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
456 
457 	tcp_mstamp_refresh(tp);
458 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
459 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
460 
461 	if (remaining > 0) {
462 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
463 					  remaining, TCP_RTO_MAX);
464 	} else {
465 		/* RTO revert clocked out retransmission.
466 		 * Will retransmit now.
467 		 */
468 		tcp_retransmit_timer(sk);
469 	}
470 }
471 EXPORT_SYMBOL(tcp_ld_RTO_revert);
472 
473 /*
474  * This routine is called by the ICMP module when it gets some
475  * sort of error condition.  If err < 0 then the socket should
476  * be closed and the error returned to the user.  If err > 0
477  * it's just the icmp type << 8 | icmp code.  After adjustment
478  * header points to the first 8 bytes of the tcp header.  We need
479  * to find the appropriate port.
480  *
481  * The locking strategy used here is very "optimistic". When
482  * someone else accesses the socket the ICMP is just dropped
483  * and for some paths there is no check at all.
484  * A more general error queue to queue errors for later handling
485  * is probably better.
486  *
487  */
488
/*
 * tcp_v4_err - ICMP error handler for TCP over IPv4
 * @skb:  ICMP packet; skb->data is read as the quoted IPv4 header, with the
 *        quoted TCP header following after ihl 32-bit words
 * @info: extra value from the ICMP message; stored as tp->mtu_info for
 *        ICMP_FRAG_NEEDED and forwarded to ip_icmp_error()
 *
 * Returns -ENOENT when no socket matches the quoted header, 0 otherwise.
 * The socket reference obtained by the lookup is dropped on every path:
 * inet_twsk_put() for TIME-WAIT, inside tcp_req_err() for request sockets,
 * sock_put() for everything else.
 */
489 int tcp_v4_err(struct sk_buff *skb, u32 info)
490 {
491 	const struct iphdr *iph = (const struct iphdr *)skb->data;
492 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
493 	struct tcp_sock *tp;
494 	const int type = icmp_hdr(skb)->type;
495 	const int code = icmp_hdr(skb)->code;
496 	struct sock *sk;
497 	struct request_sock *fastopen;
498 	u32 seq, snd_una;
499 	int err;
500 	struct net *net = dev_net(skb->dev);
501
	/* Look the connection up by the addresses and ports quoted in the
	 * error; no lookup among listeners is attempted here.
	 */
502 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
503 				       iph->daddr, th->dest, iph->saddr,
504 				       ntohs(th->source), inet_iif(skb), 0);
505 	if (!sk) {
506 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
507 		return -ENOENT;
508 	}
509 	if (sk->sk_state == TCP_TIME_WAIT) {
510 		/* To increase the counter of ignored icmps for TCP-AO */
511 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
512 		inet_twsk_put(inet_twsk(sk));
513 		return 0;
514 	}
515 	seq = ntohl(th->seq);
	/* Request sockets: tcp_req_err() decides whether to drop the request
	 * (third argument) and releases the reference itself.
	 */
516 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
517 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
518 				     type == ICMP_TIME_EXCEEDED ||
519 				     (type == ICMP_DEST_UNREACH &&
520 				      (code == ICMP_NET_UNREACH ||
521 				       code == ICMP_HOST_UNREACH)));
522 		return 0;
523 	}
524
525 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
526 		sock_put(sk);
527 		return 0;
528 	}
529
530 	bh_lock_sock(sk);
531 	/* If too many ICMPs get dropped on busy
532 	 * servers this needs to be solved differently.
533 	 * We do take care of PMTU discovery (RFC1191) special case :
534 	 * we can receive locally generated ICMP messages while socket is held.
535 	 */
	/* This block only accounts the event.  Processing continues, and the
	 * handlers below re-check sock_owned_by_user() before changing
	 * socket state.
	 */
536 	if (sock_owned_by_user(sk)) {
537 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
538 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
539 	}
540 	if (sk->sk_state == TCP_CLOSE)
541 		goto out;
542
543 	if (static_branch_unlikely(&ip4_min_ttl)) {
544 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
545 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
546 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
547 			goto out;
548 		}
549 	}
550
551 	tp = tcp_sk(sk);
552 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	/* With a fastopen request attached, the lower window edge is that
	 * request's snt_isn instead of tp->snd_una.  Errors quoting a
	 * sequence outside [snd_una, snd_nxt] are counted and ignored, except
	 * on listening sockets.
	 */
553 	fastopen = rcu_dereference(tp->fastopen_rsk);
554 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
555 	if (sk->sk_state != TCP_LISTEN &&
556 	    !between(seq, snd_una, tp->snd_nxt)) {
557 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
558 		goto out;
559 	}
560
	/* First switch: per ICMP type, either finish handling here (goto out)
	 * or choose the errno that the code after the switch will report.
	 */
561 	switch (type) {
562 	case ICMP_REDIRECT:
563 		if (!sock_owned_by_user(sk))
564 			do_redirect(skb, sk);
565 		goto out;
566 	case ICMP_SOURCE_QUENCH:
567 		/* Just silently ignore these. */
568 		goto out;
569 	case ICMP_PARAMETERPROB:
570 		err = EPROTO;
571 		break;
572 	case ICMP_DEST_UNREACH:
		/* Bounds the icmp_err_convert[] index used further down. */
573 		if (code > NR_ICMP_UNREACH)
574 			goto out;
575
576 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
577 			/* We are not interested in TCP_LISTEN and open_requests
578 			 * (SYN-ACKs send out by Linux are always <576bytes so
579 			 * they should go through unfragmented).
580 			 */
581 			if (sk->sk_state == TCP_LISTEN)
582 				goto out;
583
			/* Record the MTU, then act on it now or, if the user
			 * owns the socket, flag it as deferred.  A reference
			 * is taken only when the flag was not already set
			 * (presumably dropped by the deferred handler -
			 * confirm).
			 */
584 			WRITE_ONCE(tp->mtu_info, info);
585 			if (!sock_owned_by_user(sk)) {
586 				tcp_v4_mtu_reduced(sk);
587 			} else {
588 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
589 					sock_hold(sk);
590 			}
591 			goto out;
592 		}
593
594 		err = icmp_err_convert[code].errno;
595 		/* check if this ICMP message allows revert of backoff.
596 		 * (see RFC 6069)
597 		 */
598 		if (!fastopen &&
599 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
600 			tcp_ld_RTO_revert(sk, seq);
601 		break;
602 	case ICMP_TIME_EXCEEDED:
603 		err = EHOSTUNREACH;
604 		break;
605 	default:
606 		goto out;
607 	}
608
	/* Second switch: while still opening, the error ends the attempt if
	 * the user does not own the socket, else it is kept as a soft error.
	 * Other states fall through to the reporting code after the switch.
	 */
609 	switch (sk->sk_state) {
610 	case TCP_SYN_SENT:
611 	case TCP_SYN_RECV:
612 		/* Only in fast or simultaneous open. If a fast open socket is
613 		 * already accepted it is treated as a connected one below.
614 		 */
615 		if (fastopen && !fastopen->sk)
616 			break;
617
618 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
619
620 		if (!sock_owned_by_user(sk))
621 			tcp_done_with_error(sk, err);
622 		else
623 			WRITE_ONCE(sk->sk_err_soft, err);
624 		goto out;
625 	}
626
627 	/* If we've already connected we will keep trying
628 	 * until we time out, or the user gives up.
629 	 *
630 	 * rfc1122 4.2.3.9 allows to consider as hard errors
631 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
632 	 * but it is obsoleted by pmtu discovery).
633 	 *
634 	 * Note, that in modern internet, where routing is unreliable
635 	 * and in each dark corner broken firewalls sit, sending random
636 	 * errors ordered by their masters even this two messages finally lose
637 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
638 	 *
639 	 * Now we are in compliance with RFCs.
640 	 *							--ANK (980905)
641 	 */
642
	/* Report through sk_err only when the RECVERR bit is set and the
	 * user does not own the socket; otherwise keep it as a soft error.
	 */
643 	if (!sock_owned_by_user(sk) &&
644 	    inet_test_bit(RECVERR, sk)) {
645 		WRITE_ONCE(sk->sk_err, err);
646 		sk_error_report(sk);
647 	} else	{ /* Only an error on timeout */
648 		WRITE_ONCE(sk->sk_err_soft, err);
649 	}
650
651 out:
652 	bh_unlock_sock(sk);
653 	sock_put(sk);
654 	return 0;
655 }
656 
657 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
658 {
659 	struct tcphdr *th = tcp_hdr(skb);
660 
661 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
662 	skb->csum_start = skb_transport_header(skb) - skb->head;
663 	skb->csum_offset = offsetof(struct tcphdr, check);
664 }
665 
666 /* This routine computes an IPv4 TCP checksum. */
667 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
668 {
669 	const struct inet_sock *inet = inet_sk(sk);
670 
671 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
672 }
673 EXPORT_SYMBOL(tcp_v4_send_check);
674 
675 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
676 
677 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
678 				 const struct tcp_ao_hdr *aoh,
679 				 struct ip_reply_arg *arg, struct tcphdr *reply,
680 				 __be32 reply_options[REPLY_OPTIONS_LEN])
681 {
682 #ifdef CONFIG_TCP_AO
683 	int sdif = tcp_v4_sdif(skb);
684 	int dif = inet_iif(skb);
685 	int l3index = sdif ? dif : 0;
686 	bool allocated_traffic_key;
687 	struct tcp_ao_key *key;
688 	char *traffic_key;
689 	bool drop = true;
690 	u32 ao_sne = 0;
691 	u8 keyid;
692 
693 	rcu_read_lock();
694 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
695 				 &key, &traffic_key, &allocated_traffic_key,
696 				 &keyid, &ao_sne))
697 		goto out;
698 
699 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
700 				 (aoh->rnext_keyid << 8) | keyid);
701 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
702 	reply->doff = arg->iov[0].iov_len / 4;
703 
704 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
705 			    key, traffic_key,
706 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
707 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
708 			    reply, ao_sne))
709 		goto out;
710 	drop = false;
711 out:
712 	rcu_read_unlock();
713 	if (allocated_traffic_key)
714 		kfree(traffic_key);
715 	return drop;
716 #else
717 	return true;
718 #endif
719 }
720 
721 /*
722  *	This routine will send an RST to the other tcp.
723  *
724  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
725  *		      for reset.
726  *	Answer: if a packet caused RST, it is not for a socket
727  *		existing in our system, if it is matched to a socket,
728  *		it is just duplicate segment or bug in other side's TCP.
729  *		So that we build reply only basing on parameters
730  *		arrived with segment.
731  *	Exception: precedence violation. We do not implement it in any case.
732  */
733 
734 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
735 			      enum sk_rst_reason reason)
736 {
737 	const struct tcphdr *th = tcp_hdr(skb);
738 	struct {
739 		struct tcphdr th;
740 		__be32 opt[REPLY_OPTIONS_LEN];
741 	} rep;
742 	const __u8 *md5_hash_location = NULL;
743 	const struct tcp_ao_hdr *aoh;
744 	struct ip_reply_arg arg;
745 #ifdef CONFIG_TCP_MD5SIG
746 	struct tcp_md5sig_key *key = NULL;
747 	unsigned char newhash[16];
748 	struct sock *sk1 = NULL;
749 	int genhash;
750 #endif
751 	u64 transmit_time = 0;
752 	struct sock *ctl_sk;
753 	struct net *net;
754 	u32 txhash = 0;
755 
756 	/* Never send a reset in response to a reset. */
757 	if (th->rst)
758 		return;
759 
760 	/* If sk not NULL, it means we did a successful lookup and incoming
761 	 * route had to be correct. prequeue might have dropped our dst.
762 	 */
763 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
764 		return;
765 
766 	/* Swap the send and the receive. */
767 	memset(&rep, 0, sizeof(rep));
768 	rep.th.dest   = th->source;
769 	rep.th.source = th->dest;
770 	rep.th.doff   = sizeof(struct tcphdr) / 4;
771 	rep.th.rst    = 1;
772 
773 	if (th->ack) {
774 		rep.th.seq = th->ack_seq;
775 	} else {
776 		rep.th.ack = 1;
777 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
778 				       skb->len - (th->doff << 2));
779 	}
780 
781 	memset(&arg, 0, sizeof(arg));
782 	arg.iov[0].iov_base = (unsigned char *)&rep;
783 	arg.iov[0].iov_len  = sizeof(rep.th);
784 
785 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
786 
787 	/* Invalid TCP option size or twice included auth */
788 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
789 		return;
790 
791 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
792 		return;
793 
794 #ifdef CONFIG_TCP_MD5SIG
795 	rcu_read_lock();
796 	if (sk && sk_fullsock(sk)) {
797 		const union tcp_md5_addr *addr;
798 		int l3index;
799 
800 		/* sdif set, means packet ingressed via a device
801 		 * in an L3 domain and inet_iif is set to it.
802 		 */
803 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
804 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
805 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
806 	} else if (md5_hash_location) {
807 		const union tcp_md5_addr *addr;
808 		int sdif = tcp_v4_sdif(skb);
809 		int dif = inet_iif(skb);
810 		int l3index;
811 
812 		/*
813 		 * active side is lost. Try to find listening socket through
814 		 * source port, and then find md5 key through listening socket.
815 		 * we are not loose security here:
816 		 * Incoming packet is checked with md5 hash with finding key,
817 		 * no RST generated if md5 hash doesn't match.
818 		 */
819 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
820 					     NULL, 0, ip_hdr(skb)->saddr,
821 					     th->source, ip_hdr(skb)->daddr,
822 					     ntohs(th->source), dif, sdif);
823 		/* don't send rst if it can't find key */
824 		if (!sk1)
825 			goto out;
826 
827 		/* sdif set, means packet ingressed via a device
828 		 * in an L3 domain and dif is set to it.
829 		 */
830 		l3index = sdif ? dif : 0;
831 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
832 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
833 		if (!key)
834 			goto out;
835 
836 
837 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
838 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
839 			goto out;
840 
841 	}
842 
843 	if (key) {
844 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
845 				   (TCPOPT_NOP << 16) |
846 				   (TCPOPT_MD5SIG << 8) |
847 				   TCPOLEN_MD5SIG);
848 		/* Update length and the length the header thinks exists */
849 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
850 		rep.th.doff = arg.iov[0].iov_len / 4;
851 
852 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
853 				     key, ip_hdr(skb)->saddr,
854 				     ip_hdr(skb)->daddr, &rep.th);
855 	}
856 #endif
857 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
858 	if (rep.opt[0] == 0) {
859 		__be32 mrst = mptcp_reset_option(skb);
860 
861 		if (mrst) {
862 			rep.opt[0] = mrst;
863 			arg.iov[0].iov_len += sizeof(mrst);
864 			rep.th.doff = arg.iov[0].iov_len / 4;
865 		}
866 	}
867 
868 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
869 				      ip_hdr(skb)->saddr, /* XXX */
870 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
871 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
872 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
873 
874 	/* When socket is gone, all binding information is lost.
875 	 * routing might fail in this case. No choice here, if we choose to force
876 	 * input interface, we will misroute in case of asymmetric route.
877 	 */
878 	if (sk)
879 		arg.bound_dev_if = sk->sk_bound_dev_if;
880 
881 	trace_tcp_send_reset(sk, skb, reason);
882 
883 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
884 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
885 
886 	arg.tos = ip_hdr(skb)->tos;
887 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
888 	local_bh_disable();
889 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
890 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
891 
892 	sock_net_set(ctl_sk, net);
893 	if (sk) {
894 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
896 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
898 		transmit_time = tcp_transmit_time(sk);
899 		xfrm_sk_clone_policy(ctl_sk, sk);
900 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
901 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
902 	} else {
903 		ctl_sk->sk_mark = 0;
904 		ctl_sk->sk_priority = 0;
905 	}
906 	ip_send_unicast_reply(ctl_sk,
907 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
908 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
909 			      &arg, arg.iov[0].iov_len,
910 			      transmit_time, txhash);
911 
912 	xfrm_sk_free_policy(ctl_sk);
913 	sock_net_set(ctl_sk, &init_net);
914 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
915 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
916 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
917 	local_bh_enable();
918 
919 #ifdef CONFIG_TCP_MD5SIG
920 out:
921 	rcu_read_unlock();
922 #endif
923 }
924 
925 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
926    outside socket context is ugly, certainly. What can I do?
927  */
928 
929 static void tcp_v4_send_ack(const struct sock *sk,
930 			    struct sk_buff *skb, u32 seq, u32 ack,
931 			    u32 win, u32 tsval, u32 tsecr, int oif,
932 			    struct tcp_key *key,
933 			    int reply_flags, u8 tos, u32 txhash)
934 {
935 	const struct tcphdr *th = tcp_hdr(skb);
936 	struct {
937 		struct tcphdr th;
938 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
939 	} rep;
940 	struct net *net = sock_net(sk);
941 	struct ip_reply_arg arg;
942 	struct sock *ctl_sk;
943 	u64 transmit_time;
944 
945 	memset(&rep.th, 0, sizeof(struct tcphdr));
946 	memset(&arg, 0, sizeof(arg));
947 
948 	arg.iov[0].iov_base = (unsigned char *)&rep;
949 	arg.iov[0].iov_len  = sizeof(rep.th);
950 	if (tsecr) {
951 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
952 				   (TCPOPT_TIMESTAMP << 8) |
953 				   TCPOLEN_TIMESTAMP);
954 		rep.opt[1] = htonl(tsval);
955 		rep.opt[2] = htonl(tsecr);
956 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
957 	}
958 
959 	/* Swap the send and the receive. */
960 	rep.th.dest    = th->source;
961 	rep.th.source  = th->dest;
962 	rep.th.doff    = arg.iov[0].iov_len / 4;
963 	rep.th.seq     = htonl(seq);
964 	rep.th.ack_seq = htonl(ack);
965 	rep.th.ack     = 1;
966 	rep.th.window  = htons(win);
967 
968 #ifdef CONFIG_TCP_MD5SIG
969 	if (tcp_key_is_md5(key)) {
970 		int offset = (tsecr) ? 3 : 0;
971 
972 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
973 					  (TCPOPT_NOP << 16) |
974 					  (TCPOPT_MD5SIG << 8) |
975 					  TCPOLEN_MD5SIG);
976 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
977 		rep.th.doff = arg.iov[0].iov_len/4;
978 
979 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
980 				    key->md5_key, ip_hdr(skb)->saddr,
981 				    ip_hdr(skb)->daddr, &rep.th);
982 	}
983 #endif
984 #ifdef CONFIG_TCP_AO
985 	if (tcp_key_is_ao(key)) {
986 		int offset = (tsecr) ? 3 : 0;
987 
988 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
989 					  (tcp_ao_len(key->ao_key) << 16) |
990 					  (key->ao_key->sndid << 8) |
991 					  key->rcv_next);
992 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
993 		rep.th.doff = arg.iov[0].iov_len / 4;
994 
995 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
996 				key->ao_key, key->traffic_key,
997 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
998 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
999 				&rep.th, key->sne);
1000 	}
1001 #endif
1002 	arg.flags = reply_flags;
1003 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1004 				      ip_hdr(skb)->saddr, /* XXX */
1005 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1006 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1007 	if (oif)
1008 		arg.bound_dev_if = oif;
1009 	arg.tos = tos;
1010 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1011 	local_bh_disable();
1012 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1013 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1014 	sock_net_set(ctl_sk, net);
1015 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1016 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1017 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1018 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1019 	transmit_time = tcp_transmit_time(sk);
1020 	ip_send_unicast_reply(ctl_sk,
1021 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1022 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1023 			      &arg, arg.iov[0].iov_len,
1024 			      transmit_time, txhash);
1025 
1026 	sock_net_set(ctl_sk, &init_net);
1027 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1028 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1029 	local_bh_enable();
1030 }
1031 
/*
 * Answer a segment received for a TIME_WAIT socket with an ACK.
 *
 * @sk:  the timewait socket (accessed through inet_twsk()/tcp_twsk())
 * @skb: the received segment being answered
 *
 * Selects a signing key for the reply (TCP-AO first, otherwise TCP-MD5,
 * otherwise the zero-initialised "no key" value) and passes the sequence
 * numbers, scaled window and timestamps stored in the timewait socket to
 * tcp_v4_send_ack().  inet_twsk_put(tw) is called on every exit path,
 * including the early return taken when the auth options fail to parse.
 */
1032 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1033 {
1034 	struct inet_timewait_sock *tw = inet_twsk(sk);
1035 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1036 	struct tcp_key key = {};
1037 #ifdef CONFIG_TCP_AO
	/* Only assigned when the tcp_ao_needed branch is taken; it is only
	 * read below under key.ao_key, which can only be set on that branch.
	 */
1038 	struct tcp_ao_info *ao_info;
1039 
1040 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1041 		/* FIXME: the segment to-be-acked is not verified yet */
1042 		ao_info = rcu_dereference(tcptw->ao_info);
1043 		if (ao_info) {
1044 			const struct tcp_ao_hdr *aoh;
1045 
			/* Parse failure: drop the timewait reference and send nothing. */
1046 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1047 				inet_twsk_put(tw);
1048 				return;
1049 			}
1050 
			/* Look the key up by the rnext_keyid carried in the segment. */
1051 			if (aoh)
1052 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1053 		}
1054 	}
1055 	if (key.ao_key) {
1056 		struct tcp_ao_key *rnext_key;
1057 
1058 		key.traffic_key = snd_other_key(key.ao_key);
1059 		key.sne = READ_ONCE(ao_info->snd_sne);
1060 		rnext_key = READ_ONCE(ao_info->rnext_key);
1061 		key.rcv_next = rnext_key->rcvid;
1062 		key.type = TCP_KEY_AO;
1063 #else
	/* Dummy branch so the "} else if" below stays well-formed without TCP-AO. */
1064 	if (0) {
1065 #endif
1066 	} else if (static_branch_tcp_md5()) {
1067 		key.md5_key = tcp_twsk_md5_key(tcptw);
1068 		if (key.md5_key)
1069 			key.type = TCP_KEY_MD5;
1070 	}
1071 
	/* The stored receive window is shifted down by the stored window scale. */
1072 	tcp_v4_send_ack(sk, skb,
1073 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1074 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1075 			tcp_tw_tsval(tcptw),
1076 			READ_ONCE(tcptw->tw_ts_recent),
1077 			tw->tw_bound_dev_if, &key,
1078 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1079 			tw->tw_tos,
1080 			tw->tw_txhash);
1081 
1082 	inet_twsk_put(tw);
1083 }
1084 
/*
 * Send an ACK on behalf of a request socket.
 *
 * @sk:  the socket owning the request; its sk_state selects the sequence
 *       number to use (see the comment above 'seq' below)
 * @skb: the received segment being answered
 * @req: the request socket
 *
 * Picks a TCP-AO or TCP-MD5 key for the reply and calls tcp_v4_send_ack()
 * with oif == 0.  On the TCP-AO path the function returns without sending
 * anything when the auth options cannot be parsed, when no AO header is
 * present, when no matching key can be found, or when the traffic key
 * cannot be allocated.
 */
1085 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1086 				  struct request_sock *req)
1087 {
1088 	struct tcp_key key = {};
1089 
1090 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1091 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1092 	 */
1093 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1094 					     tcp_sk(sk)->snd_nxt;
1095 
1096 #ifdef CONFIG_TCP_AO
1097 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1098 	    tcp_rsk_used_ao(req)) {
1099 		const union tcp_md5_addr *addr;
1100 		const struct tcp_ao_hdr *aoh;
1101 		int l3index;
1102 
1103 		/* Invalid TCP option size or twice included auth */
1104 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1105 			return;
1106 		if (!aoh)
1107 			return;
1108 
1109 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1110 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		/* First try the key matching the rnext_keyid from the segment. */
1111 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1112 					      aoh->rnext_keyid, -1);
1113 		if (unlikely(!key.ao_key)) {
1114 			/* Send ACK with any matching MKT for the peer */
1115 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1116 			/* Matching key disappeared (user removed the key?)
1117 			 * let the handshake timeout.
1118 			 */
1119 			if (!key.ao_key) {
1120 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1121 						     addr,
1122 						     ntohs(tcp_hdr(skb)->source),
1123 						     &ip_hdr(skb)->daddr,
1124 						     ntohs(tcp_hdr(skb)->dest));
1125 				return;
1126 			}
1127 		}
		/* Temporary traffic key buffer; released after the ACK is sent below. */
1128 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1129 		if (!key.traffic_key)
1130 			return;
1131 
1132 		key.type = TCP_KEY_AO;
1133 		key.rcv_next = aoh->keyid;
1134 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1135 #else
	/* Dummy branch so the "} else if" below stays well-formed without TCP-AO. */
1136 	if (0) {
1137 #endif
1138 	} else if (static_branch_tcp_md5()) {
1139 		const union tcp_md5_addr *addr;
1140 		int l3index;
1141 
1142 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1143 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1144 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1145 		if (key.md5_key)
1146 			key.type = TCP_KEY_MD5;
1147 	}
1148 
1149 	tcp_v4_send_ack(sk, skb, seq,
1150 			tcp_rsk(req)->rcv_nxt,
1151 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1152 			tcp_rsk_tsval(tcp_rsk(req)),
1153 			READ_ONCE(req->ts_recent),
1154 			0, &key,
1155 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1156 			ip_hdr(skb)->tos,
1157 			READ_ONCE(tcp_rsk(req)->txhash));
	/* Only the AO path allocated key.traffic_key above. */
1158 	if (tcp_key_is_ao(&key))
1159 		kfree(key.traffic_key);
1160 }
1161 
1162 /*
1163  *	Send a SYN-ACK after having received a SYN.
1164  *	This still operates on a request_sock only, not on a big
1165  *	socket.
1166  */
1167 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1168 			      struct flowi *fl,
1169 			      struct request_sock *req,
1170 			      struct tcp_fastopen_cookie *foc,
1171 			      enum tcp_synack_type synack_type,
1172 			      struct sk_buff *syn_skb)
1173 {
1174 	const struct inet_request_sock *ireq = inet_rsk(req);
1175 	struct flowi4 fl4;
1176 	int err = -1;
1177 	struct sk_buff *skb;
1178 	u8 tos;
1179 
1180 	/* First, grab a route. */
1181 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1182 		return -1;
1183 
1184 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1185 
1186 	if (skb) {
1187 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1188 
1189 		tos = READ_ONCE(inet_sk(sk)->tos);
1190 
1191 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1192 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1193 			      (tos & INET_ECN_MASK);
1194 
1195 		if (!INET_ECN_is_capable(tos) &&
1196 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1197 			tos |= INET_ECN_ECT_0;
1198 
1199 		rcu_read_lock();
1200 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1201 					    ireq->ir_rmt_addr,
1202 					    rcu_dereference(ireq->ireq_opt),
1203 					    tos);
1204 		rcu_read_unlock();
1205 		err = net_xmit_eval(err);
1206 	}
1207 
1208 	return err;
1209 }
1210 
1211 /*
1212  *	IPv4 request_sock destructor.
1213  */
1214 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1215 {
1216 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1217 }
1218 
1219 #ifdef CONFIG_TCP_MD5SIG
1220 /*
1221  * RFC2385 MD5 checksumming requires a mapping of
1222  * IP address->MD5 Key.
1223  * We need to maintain these in the sk structure.
1224  */
1225 
/* Static key, initially false, that tcp_md5_do_add() and tcp_md5_key_copy()
 * increment when a socket gets its first md5sig_info.
 * NOTE(review): HZ is presumably the deferral timeout of the deferred
 * static key - confirm against the jump label documentation.
 */
1226 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1227 EXPORT_SYMBOL(tcp_md5_needed);
1228 
1229 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1230 {
1231 	if (!old)
1232 		return true;
1233 
1234 	/* l3index always overrides non-l3index */
1235 	if (old->l3index && new->l3index == 0)
1236 		return false;
1237 	if (old->l3index == 0 && new->l3index)
1238 		return true;
1239 
1240 	return old->prefixlen < new->prefixlen;
1241 }
1242 
1243 /* Find the Key structure for an address.  */
1244 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1245 					   const union tcp_md5_addr *addr,
1246 					   int family, bool any_l3index)
1247 {
1248 	const struct tcp_sock *tp = tcp_sk(sk);
1249 	struct tcp_md5sig_key *key;
1250 	const struct tcp_md5sig_info *md5sig;
1251 	__be32 mask;
1252 	struct tcp_md5sig_key *best_match = NULL;
1253 	bool match;
1254 
1255 	/* caller either holds rcu_read_lock() or socket lock */
1256 	md5sig = rcu_dereference_check(tp->md5sig_info,
1257 				       lockdep_sock_is_held(sk));
1258 	if (!md5sig)
1259 		return NULL;
1260 
1261 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1262 				 lockdep_sock_is_held(sk)) {
1263 		if (key->family != family)
1264 			continue;
1265 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1266 		    key->l3index != l3index)
1267 			continue;
1268 		if (family == AF_INET) {
1269 			mask = inet_make_mask(key->prefixlen);
1270 			match = (key->addr.a4.s_addr & mask) ==
1271 				(addr->a4.s_addr & mask);
1272 #if IS_ENABLED(CONFIG_IPV6)
1273 		} else if (family == AF_INET6) {
1274 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1275 						  key->prefixlen);
1276 #endif
1277 		} else {
1278 			match = false;
1279 		}
1280 
1281 		if (match && better_md5_match(best_match, key))
1282 			best_match = key;
1283 	}
1284 	return best_match;
1285 }
1286 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1287 
1288 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1289 						      const union tcp_md5_addr *addr,
1290 						      int family, u8 prefixlen,
1291 						      int l3index, u8 flags)
1292 {
1293 	const struct tcp_sock *tp = tcp_sk(sk);
1294 	struct tcp_md5sig_key *key;
1295 	unsigned int size = sizeof(struct in_addr);
1296 	const struct tcp_md5sig_info *md5sig;
1297 
1298 	/* caller either holds rcu_read_lock() or socket lock */
1299 	md5sig = rcu_dereference_check(tp->md5sig_info,
1300 				       lockdep_sock_is_held(sk));
1301 	if (!md5sig)
1302 		return NULL;
1303 #if IS_ENABLED(CONFIG_IPV6)
1304 	if (family == AF_INET6)
1305 		size = sizeof(struct in6_addr);
1306 #endif
1307 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1308 				 lockdep_sock_is_held(sk)) {
1309 		if (key->family != family)
1310 			continue;
1311 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1312 			continue;
1313 		if (key->l3index != l3index)
1314 			continue;
1315 		if (!memcmp(&key->addr, addr, size) &&
1316 		    key->prefixlen == prefixlen)
1317 			return key;
1318 	}
1319 	return NULL;
1320 }
1321 
1322 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1323 					 const struct sock *addr_sk)
1324 {
1325 	const union tcp_md5_addr *addr;
1326 	int l3index;
1327 
1328 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1329 						 addr_sk->sk_bound_dev_if);
1330 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1331 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1332 }
1333 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1334 
1335 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1336 {
1337 	struct tcp_sock *tp = tcp_sk(sk);
1338 	struct tcp_md5sig_info *md5sig;
1339 
1340 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1341 	if (!md5sig)
1342 		return -ENOMEM;
1343 
1344 	sk_gso_disable(sk);
1345 	INIT_HLIST_HEAD(&md5sig->head);
1346 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1347 	return 0;
1348 }
1349 
1350 /* This can be called on a newly created socket, from other files */
1351 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1352 			    int family, u8 prefixlen, int l3index, u8 flags,
1353 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1354 {
1355 	/* Add Key to the list */
1356 	struct tcp_md5sig_key *key;
1357 	struct tcp_sock *tp = tcp_sk(sk);
1358 	struct tcp_md5sig_info *md5sig;
1359 
1360 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1361 	if (key) {
1362 		/* Pre-existing entry - just update that one.
1363 		 * Note that the key might be used concurrently.
1364 		 * data_race() is telling kcsan that we do not care of
1365 		 * key mismatches, since changing MD5 key on live flows
1366 		 * can lead to packet drops.
1367 		 */
1368 		data_race(memcpy(key->key, newkey, newkeylen));
1369 
1370 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1371 		 * Also note that a reader could catch new key->keylen value
1372 		 * but old key->key[], this is the reason we use __GFP_ZERO
1373 		 * at sock_kmalloc() time below these lines.
1374 		 */
1375 		WRITE_ONCE(key->keylen, newkeylen);
1376 
1377 		return 0;
1378 	}
1379 
1380 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1381 					   lockdep_sock_is_held(sk));
1382 
1383 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1384 	if (!key)
1385 		return -ENOMEM;
1386 
1387 	memcpy(key->key, newkey, newkeylen);
1388 	key->keylen = newkeylen;
1389 	key->family = family;
1390 	key->prefixlen = prefixlen;
1391 	key->l3index = l3index;
1392 	key->flags = flags;
1393 	memcpy(&key->addr, addr,
1394 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1395 								 sizeof(struct in_addr));
1396 	hlist_add_head_rcu(&key->node, &md5sig->head);
1397 	return 0;
1398 }
1399 
1400 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1401 		   int family, u8 prefixlen, int l3index, u8 flags,
1402 		   const u8 *newkey, u8 newkeylen)
1403 {
1404 	struct tcp_sock *tp = tcp_sk(sk);
1405 
1406 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1407 		if (tcp_md5_alloc_sigpool())
1408 			return -ENOMEM;
1409 
1410 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1411 			tcp_md5_release_sigpool();
1412 			return -ENOMEM;
1413 		}
1414 
1415 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1416 			struct tcp_md5sig_info *md5sig;
1417 
1418 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1419 			rcu_assign_pointer(tp->md5sig_info, NULL);
1420 			kfree_rcu(md5sig, rcu);
1421 			tcp_md5_release_sigpool();
1422 			return -EUSERS;
1423 		}
1424 	}
1425 
1426 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1427 				newkey, newkeylen, GFP_KERNEL);
1428 }
1429 EXPORT_SYMBOL(tcp_md5_do_add);
1430 
1431 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1432 		     int family, u8 prefixlen, int l3index,
1433 		     struct tcp_md5sig_key *key)
1434 {
1435 	struct tcp_sock *tp = tcp_sk(sk);
1436 
1437 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1438 		tcp_md5_add_sigpool();
1439 
1440 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1441 			tcp_md5_release_sigpool();
1442 			return -ENOMEM;
1443 		}
1444 
1445 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1446 			struct tcp_md5sig_info *md5sig;
1447 
1448 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1449 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1450 			rcu_assign_pointer(tp->md5sig_info, NULL);
1451 			kfree_rcu(md5sig, rcu);
1452 			tcp_md5_release_sigpool();
1453 			return -EUSERS;
1454 		}
1455 	}
1456 
1457 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1458 				key->flags, key->key, key->keylen,
1459 				sk_gfp_mask(sk, GFP_ATOMIC));
1460 }
1461 EXPORT_SYMBOL(tcp_md5_key_copy);
1462 
1463 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1464 		   u8 prefixlen, int l3index, u8 flags)
1465 {
1466 	struct tcp_md5sig_key *key;
1467 
1468 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1469 	if (!key)
1470 		return -ENOENT;
1471 	hlist_del_rcu(&key->node);
1472 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1473 	kfree_rcu(key, rcu);
1474 	return 0;
1475 }
1476 EXPORT_SYMBOL(tcp_md5_do_del);
1477 
1478 void tcp_clear_md5_list(struct sock *sk)
1479 {
1480 	struct tcp_sock *tp = tcp_sk(sk);
1481 	struct tcp_md5sig_key *key;
1482 	struct hlist_node *n;
1483 	struct tcp_md5sig_info *md5sig;
1484 
1485 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1486 
1487 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1488 		hlist_del_rcu(&key->node);
1489 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1490 		kfree_rcu(key, rcu);
1491 	}
1492 }
1493 
1494 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1495 				 sockptr_t optval, int optlen)
1496 {
1497 	struct tcp_md5sig cmd;
1498 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1499 	const union tcp_md5_addr *addr;
1500 	u8 prefixlen = 32;
1501 	int l3index = 0;
1502 	bool l3flag;
1503 	u8 flags;
1504 
1505 	if (optlen < sizeof(cmd))
1506 		return -EINVAL;
1507 
1508 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1509 		return -EFAULT;
1510 
1511 	if (sin->sin_family != AF_INET)
1512 		return -EINVAL;
1513 
1514 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1515 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1516 
1517 	if (optname == TCP_MD5SIG_EXT &&
1518 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1519 		prefixlen = cmd.tcpm_prefixlen;
1520 		if (prefixlen > 32)
1521 			return -EINVAL;
1522 	}
1523 
1524 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1525 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1526 		struct net_device *dev;
1527 
1528 		rcu_read_lock();
1529 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1530 		if (dev && netif_is_l3_master(dev))
1531 			l3index = dev->ifindex;
1532 
1533 		rcu_read_unlock();
1534 
1535 		/* ok to reference set/not set outside of rcu;
1536 		 * right now device MUST be an L3 master
1537 		 */
1538 		if (!dev || !l3index)
1539 			return -EINVAL;
1540 	}
1541 
1542 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1543 
1544 	if (!cmd.tcpm_keylen)
1545 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1546 
1547 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1548 		return -EINVAL;
1549 
1550 	/* Don't allow keys for peers that have a matching TCP-AO key.
1551 	 * See the comment in tcp_ao_add_cmd()
1552 	 */
1553 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1554 		return -EKEYREJECTED;
1555 
1556 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1557 			      cmd.tcpm_key, cmd.tcpm_keylen);
1558 }
1559 
1560 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1561 				   __be32 daddr, __be32 saddr,
1562 				   const struct tcphdr *th, int nbytes)
1563 {
1564 	struct tcp4_pseudohdr *bp;
1565 	struct scatterlist sg;
1566 	struct tcphdr *_th;
1567 
1568 	bp = hp->scratch;
1569 	bp->saddr = saddr;
1570 	bp->daddr = daddr;
1571 	bp->pad = 0;
1572 	bp->protocol = IPPROTO_TCP;
1573 	bp->len = cpu_to_be16(nbytes);
1574 
1575 	_th = (struct tcphdr *)(bp + 1);
1576 	memcpy(_th, th, sizeof(*th));
1577 	_th->check = 0;
1578 
1579 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1580 	ahash_request_set_crypt(hp->req, &sg, NULL,
1581 				sizeof(*bp) + sizeof(*th));
1582 	return crypto_ahash_update(hp->req);
1583 }
1584 
1585 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1586 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1587 {
1588 	struct tcp_sigpool hp;
1589 
1590 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1591 		goto clear_hash_nostart;
1592 
1593 	if (crypto_ahash_init(hp.req))
1594 		goto clear_hash;
1595 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1596 		goto clear_hash;
1597 	if (tcp_md5_hash_key(&hp, key))
1598 		goto clear_hash;
1599 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1600 	if (crypto_ahash_final(hp.req))
1601 		goto clear_hash;
1602 
1603 	tcp_sigpool_end(&hp);
1604 	return 0;
1605 
1606 clear_hash:
1607 	tcp_sigpool_end(&hp);
1608 clear_hash_nostart:
1609 	memset(md5_hash, 0, 16);
1610 	return 1;
1611 }
1612 
1613 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1614 			const struct sock *sk,
1615 			const struct sk_buff *skb)
1616 {
1617 	const struct tcphdr *th = tcp_hdr(skb);
1618 	struct tcp_sigpool hp;
1619 	__be32 saddr, daddr;
1620 
1621 	if (sk) { /* valid for establish/request sockets */
1622 		saddr = sk->sk_rcv_saddr;
1623 		daddr = sk->sk_daddr;
1624 	} else {
1625 		const struct iphdr *iph = ip_hdr(skb);
1626 		saddr = iph->saddr;
1627 		daddr = iph->daddr;
1628 	}
1629 
1630 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1631 		goto clear_hash_nostart;
1632 
1633 	if (crypto_ahash_init(hp.req))
1634 		goto clear_hash;
1635 
1636 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1637 		goto clear_hash;
1638 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1639 		goto clear_hash;
1640 	if (tcp_md5_hash_key(&hp, key))
1641 		goto clear_hash;
1642 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1643 	if (crypto_ahash_final(hp.req))
1644 		goto clear_hash;
1645 
1646 	tcp_sigpool_end(&hp);
1647 	return 0;
1648 
1649 clear_hash:
1650 	tcp_sigpool_end(&hp);
1651 clear_hash_nostart:
1652 	memset(md5_hash, 0, 16);
1653 	return 1;
1654 }
1655 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1656 
1657 #endif
1658 
1659 static void tcp_v4_init_req(struct request_sock *req,
1660 			    const struct sock *sk_listener,
1661 			    struct sk_buff *skb)
1662 {
1663 	struct inet_request_sock *ireq = inet_rsk(req);
1664 	struct net *net = sock_net(sk_listener);
1665 
1666 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1667 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1668 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1669 }
1670 
1671 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1672 					  struct sk_buff *skb,
1673 					  struct flowi *fl,
1674 					  struct request_sock *req,
1675 					  u32 tw_isn)
1676 {
1677 	tcp_v4_init_req(req, sk, skb);
1678 
1679 	if (security_inet_conn_request(sk, skb, req))
1680 		return NULL;
1681 
1682 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1683 }
1684 
1685 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1686 	.family		=	PF_INET,
1687 	.obj_size	=	sizeof(struct tcp_request_sock),
1688 	.rtx_syn_ack	=	tcp_rtx_synack,
1689 	.send_ack	=	tcp_v4_reqsk_send_ack,
1690 	.destructor	=	tcp_v4_reqsk_destructor,
1691 	.send_reset	=	tcp_v4_send_reset,
1692 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1693 };
1694 
1695 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1696 	.mss_clamp	=	TCP_MSS_DEFAULT,
1697 #ifdef CONFIG_TCP_MD5SIG
1698 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1699 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1700 #endif
1701 #ifdef CONFIG_TCP_AO
1702 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1703 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1704 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1705 #endif
1706 #ifdef CONFIG_SYN_COOKIES
1707 	.cookie_init_seq =	cookie_v4_init_sequence,
1708 #endif
1709 	.route_req	=	tcp_v4_route_req,
1710 	.init_seq	=	tcp_v4_init_seq,
1711 	.init_ts_off	=	tcp_v4_init_ts_off,
1712 	.send_synack	=	tcp_v4_send_synack,
1713 };
1714 
1715 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1716 {
1717 	/* Never answer to SYNs send to broadcast or multicast */
1718 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1719 		goto drop;
1720 
1721 	return tcp_conn_request(&tcp_request_sock_ops,
1722 				&tcp_request_sock_ipv4_ops, sk, skb);
1723 
1724 drop:
1725 	tcp_listendrop(sk);
1726 	return 0;
1727 }
1728 EXPORT_SYMBOL(tcp_v4_conn_request);
1729 
1730 
1731 /*
1732  * The three way handshake has completed - we got a valid synack -
1733  * now create the new socket.
1734  */
/*
 * Create the child socket for a completed IPv4 connection request.
 *
 * @sk:         the listening socket
 * @skb:        the segment that completed the handshake
 * @req:        the request socket the child is created from
 * @dst:        route to use, or NULL to have one looked up here
 * @req_unhash: request socket handed to inet_ehash_nolisten()
 * @own_req:    set from the return value of inet_ehash_nolisten()
 *
 * Returns the new socket, or NULL on failure.  Failure paths that reach
 * the exit label call tcp_listendrop(); the accept-queue overflow path
 * additionally bumps LINUX_MIB_LISTENOVERFLOWS.  NULL is also returned
 * when the child was not inserted and a duplicate socket was found with
 * no @req_unhash (see the syncookie comment below).
 */
1735 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1736 				  struct request_sock *req,
1737 				  struct dst_entry *dst,
1738 				  struct request_sock *req_unhash,
1739 				  bool *own_req)
1740 {
1741 	struct inet_request_sock *ireq;
1742 	bool found_dup_sk = false;
1743 	struct inet_sock *newinet;
1744 	struct tcp_sock *newtp;
1745 	struct sock *newsk;
1746 #ifdef CONFIG_TCP_MD5SIG
1747 	const union tcp_md5_addr *addr;
1748 	struct tcp_md5sig_key *key;
1749 	int l3index;
1750 #endif
1751 	struct ip_options_rcu *inet_opt;
1752 
1753 	if (sk_acceptq_is_full(sk))
1754 		goto exit_overflow;
1755 
1756 	newsk = tcp_create_openreq_child(sk, req, skb);
1757 	if (!newsk)
1758 		goto exit_nonewsk;
1759 
1760 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1761 	inet_sk_rx_dst_set(newsk, skb);
1762 
	/* Copy addressing and IP-level state from the request and the skb. */
1763 	newtp		      = tcp_sk(newsk);
1764 	newinet		      = inet_sk(newsk);
1765 	ireq		      = inet_rsk(req);
1766 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1767 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1768 	newsk->sk_bound_dev_if = ireq->ir_iif;
1769 	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's options pointer for now; one of the
	 * two pointers is cleared below depending on *own_req.
	 */
1770 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1771 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1772 	newinet->mc_index     = inet_iif(skb);
1773 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1774 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1775 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1776 	if (inet_opt)
1777 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1778 	atomic_set(&newinet->inet_id, get_random_u16());
1779 
1780 	/* Set ToS of the new socket based upon the value of incoming SYN.
1781 	 * ECT bits are set later in tcp_init_transfer().
1782 	 */
1783 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1784 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1785 
1786 	if (!dst) {
1787 		dst = inet_csk_route_child_sock(sk, newsk, req);
1788 		if (!dst)
1789 			goto put_and_exit;
1790 	} else {
1791 		/* syncookie case : see end of cookie_v4_check() */
1792 	}
1793 	sk_setup_caps(newsk, dst);
1794 
1795 	tcp_ca_openreq_child(newsk, dst);
1796 
1797 	tcp_sync_mss(newsk, dst_mtu(dst));
1798 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1799 
1800 	tcp_initialize_rcv_mss(newsk);
1801 
1802 #ifdef CONFIG_TCP_MD5SIG
1803 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1804 	/* Copy over the MD5 key from the original socket */
1805 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1806 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* The key is copied with a fixed /32 prefix, and only when the
	 * request did not use TCP-AO.
	 */
1807 	if (key && !tcp_rsk_used_ao(req)) {
1808 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1809 			goto put_and_exit;
1810 		sk_gso_disable(newsk);
1811 	}
1812 #endif
1813 #ifdef CONFIG_TCP_AO
1814 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1815 		goto put_and_exit; /* OOM, release back memory */
1816 #endif
1817 
1818 	if (__inet_inherit_port(sk, newsk) < 0)
1819 		goto put_and_exit;
1820 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1821 				       &found_dup_sk);
1822 	if (likely(*own_req)) {
1823 		tcp_move_syn(newtp, req);
		/* The options pointer now lives only in the child. */
1824 		ireq->ireq_opt = NULL;
1825 	} else {
		/* The options pointer stays with the request only. */
1826 		newinet->inet_opt = NULL;
1827 
1828 		if (!req_unhash && found_dup_sk) {
1829 			/* This code path should only be executed in the
1830 			 * syncookie case only
1831 			 */
1832 			bh_unlock_sock(newsk);
1833 			sock_put(newsk);
1834 			newsk = NULL;
1835 		}
1836 	}
1837 	return newsk;
1838 
1839 exit_overflow:
1840 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1841 exit_nonewsk:
1842 	dst_release(dst);
1843 exit:
1844 	tcp_listendrop(sk);
1845 	return NULL;
1846 put_and_exit:
	/* Detach the options pointer shared with the request before the
	 * child is torn down.
	 */
1847 	newinet->inet_opt = NULL;
1848 	inet_csk_prepare_forced_close(newsk);
1849 	tcp_done(newsk);
1850 	goto exit;
1851 }
1852 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1853 
/* With CONFIG_SYN_COOKIES, hand every non-SYN segment to
 * cookie_v4_check() and return its result; SYN segments, and all
 * segments when syncookies are compiled out, return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (tcp_hdr(skb)->syn)
		return sk;

	return cookie_v4_check(sk, skb);
#else
	return sk;
#endif
}
1864 
1865 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1866 			 struct tcphdr *th, u32 *cookie)
1867 {
1868 	u16 mss = 0;
1869 #ifdef CONFIG_SYN_COOKIES
1870 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1871 				    &tcp_request_sock_ipv4_ops, sk, th);
1872 	if (mss) {
1873 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1874 		tcp_synq_overflow(sk);
1875 	}
1876 #endif
1877 	return mss;
1878 }
1879 
/* Declaration of ipv4_dst_check() so that the INDIRECT_CALL_1() in
 * tcp_v4_do_rcv() can name it as the expected dst->ops->check target.
 */
1880 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1881 							   u32));
1882 /* The socket must have it's spinlock held when we get
1883  * here, unless it is a TCP_LISTEN socket.
1884  *
1885  * We have a potential double-lock case here, so even when
1886  * doing backlog processing we use the BH locking scheme.
1887  * This is because we cannot sleep with the original spinlock
1888  * held.
1889  */
1890 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1891 {
1892 	enum skb_drop_reason reason;
1893 	struct sock *rsk;
1894 
1895 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1896 		struct dst_entry *dst;
1897 
1898 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1899 						lockdep_sock_is_held(sk));
1900 
1901 		sock_rps_save_rxhash(sk, skb);
1902 		sk_mark_napi_id(sk, skb);
1903 		if (dst) {
1904 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1905 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1906 					     dst, 0)) {
1907 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1908 				dst_release(dst);
1909 			}
1910 		}
1911 		tcp_rcv_established(sk, skb);
1912 		return 0;
1913 	}
1914 
1915 	if (tcp_checksum_complete(skb))
1916 		goto csum_err;
1917 
1918 	if (sk->sk_state == TCP_LISTEN) {
1919 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1920 
1921 		if (!nsk)
1922 			return 0;
1923 		if (nsk != sk) {
1924 			reason = tcp_child_process(sk, nsk, skb);
1925 			if (reason) {
1926 				rsk = nsk;
1927 				goto reset;
1928 			}
1929 			return 0;
1930 		}
1931 	} else
1932 		sock_rps_save_rxhash(sk, skb);
1933 
1934 	reason = tcp_rcv_state_process(sk, skb);
1935 	if (reason) {
1936 		rsk = sk;
1937 		goto reset;
1938 	}
1939 	return 0;
1940 
1941 reset:
1942 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1943 discard:
1944 	sk_skb_reason_drop(sk, skb, reason);
1945 	/* Be careful here. If this function gets more complicated and
1946 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1947 	 * might be destroyed here. This current version compiles correctly,
1948 	 * but you have been warned.
1949 	 */
1950 	return 0;
1951 
1952 csum_err:
1953 	reason = SKB_DROP_REASON_TCP_CSUM;
1954 	trace_tcp_bad_csum(skb);
1955 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1956 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1957 	goto discard;
1958 }
1959 EXPORT_SYMBOL(tcp_v4_do_rcv);
1960 
1961 int tcp_v4_early_demux(struct sk_buff *skb)
1962 {
1963 	struct net *net = dev_net(skb->dev);
1964 	const struct iphdr *iph;
1965 	const struct tcphdr *th;
1966 	struct sock *sk;
1967 
1968 	if (skb->pkt_type != PACKET_HOST)
1969 		return 0;
1970 
1971 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1972 		return 0;
1973 
1974 	iph = ip_hdr(skb);
1975 	th = tcp_hdr(skb);
1976 
1977 	if (th->doff < sizeof(struct tcphdr) / 4)
1978 		return 0;
1979 
1980 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1981 				       iph->saddr, th->source,
1982 				       iph->daddr, ntohs(th->dest),
1983 				       skb->skb_iif, inet_sdif(skb));
1984 	if (sk) {
1985 		skb->sk = sk;
1986 		skb->destructor = sock_edemux;
1987 		if (sk_fullsock(sk)) {
1988 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1989 
1990 			if (dst)
1991 				dst = dst_check(dst, 0);
1992 			if (dst &&
1993 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1994 				skb_dst_set_noref(skb, dst);
1995 		}
1996 	}
1997 	return 0;
1998 }
1999 
/*
 * tcp_add_backlog - put @skb on @sk's backlog, coalescing when possible.
 *
 * After verifying the TCP checksum, try to merge @skb into the current
 * backlog tail with skb_try_coalesce(); if that is not possible, hand the
 * skb to sk_add_backlog() with a limit built from sk_rcvbuf and sk_sndbuf.
 *
 * Returns true when @skb was NOT queued (checksum failure or
 * sk_add_backlog() refused it).  On those paths *@reason is set and
 * bh_unlock_sock(@sk) has already been called here.
 * Returns false when @skb was coalesced or queued; no unlock is done on
 * that path.
 */
2000 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2001 		     enum skb_drop_reason *reason)
2002 {
2003 	u32 tail_gso_size, tail_gso_segs;
2004 	struct skb_shared_info *shinfo;
2005 	const struct tcphdr *th;
2006 	struct tcphdr *thtail;
2007 	struct sk_buff *tail;
2008 	unsigned int hdrlen;
2009 	bool fragstolen;
2010 	u32 gso_segs;
2011 	u32 gso_size;
2012 	u64 limit;
2013 	int delta;
2014 
2015 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2016 	 * we can fix skb->truesize to its real value to avoid future drops.
2017 	 * This is valid because skb is not yet charged to the socket.
2018 	 * It has been noticed pure SACK packets were sometimes dropped
2019 	 * (if cooked by drivers without copybreak feature).
2020 	 */
2021 	skb_condense(skb);
2022 
2023 	skb_dst_drop(skb);
2024 
2025 	if (unlikely(tcp_checksum_complete(skb))) {
2026 		bh_unlock_sock(sk);
2027 		trace_tcp_bad_csum(skb);
2028 		*reason = SKB_DROP_REASON_TCP_CSUM;
2029 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2030 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2031 		return true;
2032 	}
2033 
2034 	/* Attempt coalescing to last skb in backlog, even if we are
2035 	 * above the limits.
2036 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2037 	 */
2038 	th = (const struct tcphdr *)skb->data;
2039 	hdrlen = th->doff * 4;
2040 
2041 	tail = sk->sk_backlog.tail;
2042 	if (!tail)
2043 		goto no_coalesce;
2044 	thtail = (struct tcphdr *)tail->data;
2045 
     	/* Coalesce only if @skb directly follows the tail in sequence space,
     	 * both carry the same DS field, neither has SYN/RST/URG, both have
     	 * ACK, their ECE/CWR bits agree, tcp_skb_can_collapse_rx() allows it,
     	 * and the TCP header lengths and option bytes are identical.
     	 */
2046 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2047 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2048 	    ((TCP_SKB_CB(tail)->tcp_flags |
2049 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2050 	    !((TCP_SKB_CB(tail)->tcp_flags &
2051 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2052 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2053 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2054 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2055 	    thtail->doff != th->doff ||
2056 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2057 		goto no_coalesce;
2058 
     	/* Strip the TCP header for the merge; it is pushed back below if
     	 * skb_try_coalesce() fails.
     	 */
2059 	__skb_pull(skb, hdrlen);
2060 
2061 	shinfo = skb_shinfo(skb);
2062 	gso_size = shinfo->gso_size ?: skb->len;
2063 	gso_segs = shinfo->gso_segs ?: 1;
2064 
     	/* From here on shinfo refers to the tail skb. */
2065 	shinfo = skb_shinfo(tail);
2066 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2067 	tail_gso_segs = shinfo->gso_segs ?: 1;
2068 
2069 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2070 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2071 
     		/* Take ack_seq and window from @skb unless its ack is older. */
2072 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2073 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2074 			thtail->window = th->window;
2075 		}
2076 
2077 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2078 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2079 		 * is not entered if we append a packet with a FIN.
2080 		 * SYN, RST, URG are not present.
2081 		 * ACK is set on both packets.
2082 		 * PSH : we do not really care in TCP stack,
2083 		 *       at least for 'GRO' packets.
2084 		 */
2085 		thtail->fin |= th->fin;
2086 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2087 
2088 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2089 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2090 			tail->tstamp = skb->tstamp;
2091 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2092 		}
2093 
2094 		/* Not as strict as GRO. We only need to carry mss max value */
2095 		shinfo->gso_size = max(gso_size, tail_gso_size);
2096 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2097 
2098 		sk->sk_backlog.len += delta;
2099 		__NET_INC_STATS(sock_net(sk),
2100 				LINUX_MIB_TCPBACKLOGCOALESCE);
2101 		kfree_skb_partial(skb, fragstolen);
2102 		return false;
2103 	}
2104 	__skb_push(skb, hdrlen);
2105 
2106 no_coalesce:
2107 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2108 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2109 	 * sk_rcvbuf in normal conditions.
2110 	 */
2111 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2112 
2113 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2114 
2115 	/* Only socket owner can try to collapse/prune rx queues
2116 	 * to reduce memory overhead, so add a little headroom here.
2117 	 * Few sockets backlog are possibly concurrently non empty.
2118 	 */
2119 	limit += 64 * 1024;
2120 
     	/* The sum is computed in 64 bits and then clamped to UINT_MAX. */
2121 	limit = min_t(u64, limit, UINT_MAX);
2122 
2123 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2124 		bh_unlock_sock(sk);
2125 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2126 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2127 		return true;
2128 	}
2129 	return false;
2130 }
2131 EXPORT_SYMBOL(tcp_add_backlog);
2132 
2133 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2134 {
2135 	struct tcphdr *th = (struct tcphdr *)skb->data;
2136 
2137 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2138 }
2139 EXPORT_SYMBOL(tcp_filter);
2140 
2141 static void tcp_v4_restore_cb(struct sk_buff *skb)
2142 {
2143 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2144 		sizeof(struct inet_skb_parm));
2145 }
2146 
/*
 * Fill TCP_SKB_CB(@skb) from the IPv4 header @iph and TCP header @th.
 *
 * The IP control block is first moved into TCP_SKB_CB(skb)->header.h4, then
 * seq, end_seq, ack_seq, tcp_flags, ip_dsfield, sacked and has_rxtstamp are
 * written.  end_seq counts SYN and FIN as one sequence number each, plus the
 * payload length (skb->len minus the TCP header length).
 * tcp_v4_restore_cb() performs the reverse move.
 */
2147 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2148 			   const struct tcphdr *th)
2149 {
2150 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2151 	 * barrier() makes sure compiler wont play fool^Waliasing games.
2152 	 */
2153 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2154 		sizeof(struct inet_skb_parm));
2155 	barrier();
2156 
2157 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2158 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2159 				    skb->len - th->doff * 4);
2160 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2161 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2162 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2163 	TCP_SKB_CB(skb)->sacked	 = 0;
     	/* Set when either a software or a hardware timestamp is present. */
2164 	TCP_SKB_CB(skb)->has_rxtstamp =
2165 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2166 }
2167 
2168 /*
2169  *	From tcp_input.c
2170  */
2171 
/*
 * tcp_v4_rcv - receive entry point for an IPv4 TCP segment.
 *
 * Outline of what the code below does:
 *  - drops packets not addressed to this host and validates that the TCP
 *    header (including options) can be pulled;
 *  - initialises checksum state and looks the socket up with
 *    __inet_lookup_skb();
 *  - TCP_TIME_WAIT sockets are handled at do_time_wait;
 *  - TCP_NEW_SYN_RECV request sockets are checked against their listener
 *    (or a migrated reuseport socket) through tcp_check_req();
 *  - otherwise, after min-TTL, xfrm, inbound-hash and socket-filter checks,
 *    the segment is processed by tcp_v4_do_rcv() when the socket is not
 *    owned by a user context, or queued with tcp_add_backlog() when it is.
 *
 * @refcounted records whether a reference on @sk must be dropped before
 * returning.  Every drop path returns 0; otherwise the return value is that
 * of tcp_v4_do_rcv(), or 0 when the segment was put on the backlog.
 */
2172 int tcp_v4_rcv(struct sk_buff *skb)
2173 {
2174 	struct net *net = dev_net(skb->dev);
2175 	enum skb_drop_reason drop_reason;
2176 	int sdif = inet_sdif(skb);
2177 	int dif = inet_iif(skb);
2178 	const struct iphdr *iph;
2179 	const struct tcphdr *th;
2180 	struct sock *sk = NULL;
2181 	bool refcounted;
2182 	int ret;
2183 	u32 isn;
2184 
2185 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2186 	if (skb->pkt_type != PACKET_HOST)
2187 		goto discard_it;
2188 
2189 	/* Count it even if it's bad */
2190 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2191 
2192 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2193 		goto discard_it;
2194 
2195 	th = (const struct tcphdr *)skb->data;
2196 
2197 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2198 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2199 		goto bad_packet;
2200 	}
2201 	if (!pskb_may_pull(skb, th->doff * 4))
2202 		goto discard_it;
2203 
2204 	/* An explanation is required here, I think.
2205 	 * Packet length and doff are validated by header prediction,
2206 	 * provided case of th->doff==0 is eliminated.
2207 	 * So, we defer the checks. */
2208 
2209 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2210 		goto csum_error;
2211 
     	/* Reload the header pointer after the second pskb_may_pull() above. */
2212 	th = (const struct tcphdr *)skb->data;
2213 	iph = ip_hdr(skb);
2214 lookup:
2215 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2216 			       skb, __tcp_hdrlen(th), th->source,
2217 			       th->dest, sdif, &refcounted);
2218 	if (!sk)
2219 		goto no_tcp_socket;
2220 
2221 	if (sk->sk_state == TCP_TIME_WAIT)
2222 		goto do_time_wait;
2223 
2224 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2225 		struct request_sock *req = inet_reqsk(sk);
2226 		bool req_stolen = false;
2227 		struct sock *nsk;
2228 
     		/* From here on sk is the request's listener, not the request. */
2229 		sk = req->rsk_listener;
2230 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2231 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2232 		else
2233 			drop_reason = tcp_inbound_hash(sk, req, skb,
2234 						       &iph->saddr, &iph->daddr,
2235 						       AF_INET, dif, sdif);
2236 		if (unlikely(drop_reason)) {
2237 			sk_drops_add(sk, skb);
2238 			reqsk_put(req);
2239 			goto discard_it;
2240 		}
2241 		if (tcp_checksum_complete(skb)) {
2242 			reqsk_put(req);
2243 			goto csum_error;
2244 		}
2245 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2246 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2247 			if (!nsk) {
2248 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2249 				goto lookup;
2250 			}
2251 			sk = nsk;
2252 			/* reuseport_migrate_sock() has already held one sk_refcnt
2253 			 * before returning.
2254 			 */
2255 		} else {
2256 			/* We own a reference on the listener, increase it again
2257 			 * as we might lose it too soon.
2258 			 */
2259 			sock_hold(sk);
2260 		}
2261 		refcounted = true;
2262 		nsk = NULL;
2263 		if (!tcp_filter(sk, skb)) {
2264 			th = (const struct tcphdr *)skb->data;
2265 			iph = ip_hdr(skb);
2266 			tcp_v4_fill_cb(skb, iph, th);
2267 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2268 		} else {
2269 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2270 		}
2271 		if (!nsk) {
2272 			reqsk_put(req);
2273 			if (req_stolen) {
2274 				/* Another cpu got exclusive access to req
2275 				 * and created a full blown socket.
2276 				 * Try to feed this packet to this socket
2277 				 * instead of discarding it.
2278 				 */
2279 				tcp_v4_restore_cb(skb);
2280 				sock_put(sk);
2281 				goto lookup;
2282 			}
2283 			goto discard_and_relse;
2284 		}
2285 		nf_reset_ct(skb);
     		/* nsk == sk: restore the IP control block and fall through to
     		 * the "process" path with sk.  Otherwise hand the segment to
     		 * the socket returned by tcp_check_req().
     		 */
2286 		if (nsk == sk) {
2287 			reqsk_put(req);
2288 			tcp_v4_restore_cb(skb);
2289 		} else {
2290 			drop_reason = tcp_child_process(sk, nsk, skb);
2291 			if (drop_reason) {
2292 				enum sk_rst_reason rst_reason;
2293 
2294 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2295 				tcp_v4_send_reset(nsk, skb, rst_reason);
2296 				goto discard_and_relse;
2297 			}
2298 			sock_put(sk);
2299 			return 0;
2300 		}
2301 	}
2302 
2303 process:
2304 	if (static_branch_unlikely(&ip4_min_ttl)) {
2305 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2306 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2307 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2308 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2309 			goto discard_and_relse;
2310 		}
2311 	}
2312 
2313 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2314 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2315 		goto discard_and_relse;
2316 	}
2317 
2318 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2319 				       AF_INET, dif, sdif);
2320 	if (drop_reason)
2321 		goto discard_and_relse;
2322 
2323 	nf_reset_ct(skb);
2324 
2325 	if (tcp_filter(sk, skb)) {
2326 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2327 		goto discard_and_relse;
2328 	}
     	/* Header pointers are re-read after the filter has run. */
2329 	th = (const struct tcphdr *)skb->data;
2330 	iph = ip_hdr(skb);
2331 	tcp_v4_fill_cb(skb, iph, th);
2332 
2333 	skb->dev = NULL;
2334 
     	/* Listeners are processed without taking the socket lock here. */
2335 	if (sk->sk_state == TCP_LISTEN) {
2336 		ret = tcp_v4_do_rcv(sk, skb);
2337 		goto put_and_return;
2338 	}
2339 
2340 	sk_incoming_cpu_update(sk);
2341 
2342 	bh_lock_sock_nested(sk);
2343 	tcp_segs_in(tcp_sk(sk), skb);
2344 	ret = 0;
2345 	if (!sock_owned_by_user(sk)) {
2346 		ret = tcp_v4_do_rcv(sk, skb);
2347 	} else {
     		/* tcp_add_backlog() has already called bh_unlock_sock() on
     		 * the paths where it returns true.
     		 */
2348 		if (tcp_add_backlog(sk, skb, &drop_reason))
2349 			goto discard_and_relse;
2350 	}
2351 	bh_unlock_sock(sk);
2352 
2353 put_and_return:
2354 	if (refcounted)
2355 		sock_put(sk);
2356 
2357 	return ret;
2358 
2359 no_tcp_socket:
2360 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2361 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2362 		goto discard_it;
2363 
2364 	tcp_v4_fill_cb(skb, iph, th);
2365 
     	/* The csum_error and bad_packet labels sit inside this branch so that
     	 * jumps from above share its statistics updates; a reset is sent only
     	 * when the checksum is valid.
     	 */
2366 	if (tcp_checksum_complete(skb)) {
2367 csum_error:
2368 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2369 		trace_tcp_bad_csum(skb);
2370 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2371 bad_packet:
2372 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2373 	} else {
2374 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2375 	}
2376 
2377 discard_it:
2378 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2379 	/* Discard frame. */
2380 	sk_skb_reason_drop(sk, skb, drop_reason);
2381 	return 0;
2382 
2383 discard_and_relse:
2384 	sk_drops_add(sk, skb);
2385 	if (refcounted)
2386 		sock_put(sk);
2387 	goto discard_it;
2388 
2389 do_time_wait:
2390 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2391 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2392 		inet_twsk_put(inet_twsk(sk));
2393 		goto discard_it;
2394 	}
2395 
2396 	tcp_v4_fill_cb(skb, iph, th);
2397 
2398 	if (tcp_checksum_complete(skb)) {
2399 		inet_twsk_put(inet_twsk(sk));
2400 		goto csum_error;
2401 	}
2402 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2403 	case TCP_TW_SYN: {
     		/* Look for a listener that can take this SYN; if one exists the
     		 * timewait socket is descheduled and processing restarts on the
     		 * listener with the ISN stored in the per-cpu tcp_tw_isn.
     		 */
2404 		struct sock *sk2 = inet_lookup_listener(net,
2405 							net->ipv4.tcp_death_row.hashinfo,
2406 							skb, __tcp_hdrlen(th),
2407 							iph->saddr, th->source,
2408 							iph->daddr, th->dest,
2409 							inet_iif(skb),
2410 							sdif);
2411 		if (sk2) {
2412 			inet_twsk_deschedule_put(inet_twsk(sk));
2413 			sk = sk2;
2414 			tcp_v4_restore_cb(skb);
2415 			refcounted = false;
2416 			__this_cpu_write(tcp_tw_isn, isn);
2417 			goto process;
2418 		}
2419 	}
2420 		/* to ACK */
2421 		fallthrough;
2422 	case TCP_TW_ACK:
2423 		tcp_v4_timewait_ack(sk, skb);
2424 		break;
2425 	case TCP_TW_RST:
2426 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2427 		inet_twsk_deschedule_put(inet_twsk(sk));
2428 		goto discard_it;
2429 	case TCP_TW_SUCCESS:;
2430 	}
2431 	goto discard_it;
2432 }
2433 
/* TIME_WAIT ops for TCP: object size of struct tcp_timewait_sock and the
 * destructor callback.
 */
2434 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2435 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2436 	.twsk_destructor= tcp_twsk_destructor,
2437 };
2438 
2439 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2440 {
2441 	struct dst_entry *dst = skb_dst(skb);
2442 
2443 	if (dst && dst_hold_safe(dst)) {
2444 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2445 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2446 	}
2447 }
2448 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2449 
/* IPv4 address-family operations for connection sockets.  tcp_v4_init_sock()
 * installs this table as icsk->icsk_af_ops.
 */
2450 const struct inet_connection_sock_af_ops ipv4_specific = {
2451 	.queue_xmit	   = ip_queue_xmit,
2452 	.send_check	   = tcp_v4_send_check,
2453 	.rebuild_header	   = inet_sk_rebuild_header,
2454 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2455 	.conn_request	   = tcp_v4_conn_request,
2456 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2457 	.net_header_len	   = sizeof(struct iphdr),
2458 	.setsockopt	   = ip_setsockopt,
2459 	.getsockopt	   = ip_getsockopt,
2460 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2461 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2462 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2463 };
2464 EXPORT_SYMBOL(ipv4_specific);
2465 
/* IPv4 TCP-MD5 / TCP-AO callbacks.  Built only when at least one of the two
 * options is configured; each group of members exists only under its own
 * option.  tcp_v4_init_sock() installs this table as tcp_sk(sk)->af_specific.
 */
2466 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2467 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2468 #ifdef CONFIG_TCP_MD5SIG
2469 	.md5_lookup		= tcp_v4_md5_lookup,
2470 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2471 	.md5_parse		= tcp_v4_parse_md5_keys,
2472 #endif
2473 #ifdef CONFIG_TCP_AO
2474 	.ao_lookup		= tcp_v4_ao_lookup,
2475 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2476 	.ao_parse		= tcp_v4_parse_ao,
2477 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2478 #endif
2479 };
2480 #endif
2481 
2482 /* NOTE: A lot of things set to zero explicitly by call to
2483  *       sk_alloc() so need not be done here.
2484  */
2485 static int tcp_v4_init_sock(struct sock *sk)
2486 {
2487 	struct inet_connection_sock *icsk = inet_csk(sk);
2488 
2489 	tcp_init_sock(sk);
2490 
2491 	icsk->icsk_af_ops = &ipv4_specific;
2492 
2493 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2494 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2495 #endif
2496 
2497 	return 0;
2498 }
2499 
#ifdef CONFIG_TCP_MD5SIG
/*
 * RCU callback queued by tcp_v4_destroy_sock(): frees the tcp_md5sig_info
 * that embeds @head, then drops the tcp_md5_needed static-branch count and
 * releases the MD5 sigpool.
 */
static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
{
	struct tcp_md5sig_info *info =
		container_of(head, struct tcp_md5sig_info, rcu);

	kfree(info);
	static_branch_slow_dec_deferred(&tcp_md5_needed);
	tcp_md5_release_sigpool();
}
#endif
2511 
/*
 * tcp_v4_destroy_sock - release the TCP-specific state attached to @sk.
 *
 * In order: stops the transmit timers, tears down congestion control and
 * ULP state, purges the write queue and the out-of-order queue, frees MD5
 * and AO key material, releases the bound port if any, frees fastopen and
 * saved-SYN data, and finally decrements the allocated-sockets counter.
 */
2512 void tcp_v4_destroy_sock(struct sock *sk)
2513 {
2514 	struct tcp_sock *tp = tcp_sk(sk);
2515 
2516 	trace_tcp_destroy_sock(sk);
2517 
2518 	tcp_clear_xmit_timers(sk);
2519 
2520 	tcp_cleanup_congestion_control(sk);
2521 
2522 	tcp_cleanup_ulp(sk);
2523 
2524 	/* Clean up the write buffer. */
2525 	tcp_write_queue_purge(sk);
2526 
2527 	/* Check if we want to disable active TFO */
2528 	tcp_fastopen_active_disable_ofo_check(sk);
2529 
2530 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2531 	skb_rbtree_purge(&tp->out_of_order_queue);
2532 
2533 #ifdef CONFIG_TCP_MD5SIG
2534 	/* Clean up the MD5 key list, if any */
2535 	if (tp->md5sig_info) {
2536 		struct tcp_md5sig_info *md5sig;
2537 
2538 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2539 		tcp_clear_md5_list(sk);
     		/* The info structure itself is freed from the RCU callback. */
2540 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2541 		rcu_assign_pointer(tp->md5sig_info, NULL);
2542 	}
2543 #endif
2544 	tcp_ao_destroy_sock(sk, false);
2545 
2546 	/* Clean up a referenced TCP bind bucket. */
2547 	if (inet_csk(sk)->icsk_bind_hash)
2548 		inet_put_port(sk);
2549 
     	/* A fastopen request socket must not still be attached here. */
2550 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2551 
2552 	/* If socket is aborted during connect operation */
2553 	tcp_free_fastopen_req(tp);
2554 	tcp_fastopen_destroy_cipher(sk);
2555 	tcp_saved_syn_free(tp);
2556 
2557 	sk_sockets_allocated_dec(sk);
2558 }
2559 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2560 
2561 #ifdef CONFIG_PROC_FS
2562 /* Proc filesystem TCP sock list dumping. */
2563 
2564 static unsigned short seq_file_family(const struct seq_file *seq);
2565 
2566 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2567 {
2568 	unsigned short family = seq_file_family(seq);
2569 
2570 	/* AF_UNSPEC is used as a match all */
2571 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2572 		net_eq(sock_net(sk), seq_file_net(seq)));
2573 }
2574 
/*
 * Locking: when a socket is returned, the spinlock of listening bucket
 * st->bucket is still held; it is released later by listening_get_next()
 * or tcp_seq_stop().  When NULL is returned no bucket lock is held.
 * st->offset is reset to 0 on entry.
 */
2575 /* Find a non empty bucket (starting from st->bucket)
2576  * and return the first sk from it.
2577  */
2578 static void *listening_get_first(struct seq_file *seq)
2579 {
2580 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2581 	struct tcp_iter_state *st = seq->private;
2582 
2583 	st->offset = 0;
2584 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2585 		struct inet_listen_hashbucket *ilb2;
2586 		struct hlist_nulls_node *node;
2587 		struct sock *sk;
2588 
2589 		ilb2 = &hinfo->lhash2[st->bucket];
     		/* Empty buckets are skipped without taking the lock. */
2590 		if (hlist_nulls_empty(&ilb2->nulls_head))
2591 			continue;
2592 
2593 		spin_lock(&ilb2->lock);
2594 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
     			/* Returns with ilb2->lock held. */
2595 			if (seq_sk_match(seq, sk))
2596 				return sk;
2597 		}
2598 		spin_unlock(&ilb2->lock);
2599 	}
2600 
2601 	return NULL;
2602 }
2603 
/*
 * Locking: this function takes no lock for the current bucket; it unlocks
 * bucket st->bucket when that bucket is exhausted, so the caller is
 * expected to hold that lock on entry (as left by listening_get_first()).
 * st->num and st->offset are incremented on every call.
 */
2604 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2605  * If "cur" is the last one in the st->bucket,
2606  * call listening_get_first() to return the first sk of the next
2607  * non empty bucket.
2608  */
2609 static void *listening_get_next(struct seq_file *seq, void *cur)
2610 {
2611 	struct tcp_iter_state *st = seq->private;
2612 	struct inet_listen_hashbucket *ilb2;
2613 	struct hlist_nulls_node *node;
2614 	struct inet_hashinfo *hinfo;
2615 	struct sock *sk = cur;
2616 
2617 	++st->num;
2618 	++st->offset;
2619 
2620 	sk = sk_nulls_next(sk);
2621 	sk_nulls_for_each_from(sk, node) {
2622 		if (seq_sk_match(seq, sk))
2623 			return sk;
2624 	}
2625 
     	/* Bucket exhausted: drop its lock and continue with the next one. */
2626 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2627 	ilb2 = &hinfo->lhash2[st->bucket];
2628 	spin_unlock(&ilb2->lock);
2629 	++st->bucket;
2630 	return listening_get_first(seq);
2631 }
2632 
2633 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2634 {
2635 	struct tcp_iter_state *st = seq->private;
2636 	void *rc;
2637 
2638 	st->bucket = 0;
2639 	st->offset = 0;
2640 	rc = listening_get_first(seq);
2641 
2642 	while (rc && *pos) {
2643 		rc = listening_get_next(seq, rc);
2644 		--*pos;
2645 	}
2646 	return rc;
2647 }
2648 
2649 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2650 				const struct tcp_iter_state *st)
2651 {
2652 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2653 }
2654 
/*
 * Locking: when a socket is returned, the ehash lock for st->bucket is
 * still held (taken with spin_lock_bh()); it is released later by
 * established_get_next() or tcp_seq_stop().  When NULL is returned no
 * lock is held.  st->offset is reset to 0 on entry.
 */
2655 /*
2656  * Get first established socket starting from bucket given in st->bucket.
2657  * If st->bucket is zero, the very first socket in the hash is returned.
2658  */
2659 static void *established_get_first(struct seq_file *seq)
2660 {
2661 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2662 	struct tcp_iter_state *st = seq->private;
2663 
2664 	st->offset = 0;
2665 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2666 		struct sock *sk;
2667 		struct hlist_nulls_node *node;
2668 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2669 
2670 		cond_resched();
2671 
2672 		/* Lockless fast path for the common case of empty buckets */
2673 		if (empty_bucket(hinfo, st))
2674 			continue;
2675 
2676 		spin_lock_bh(lock);
2677 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
     			/* Returns with the bucket lock held. */
2678 			if (seq_sk_match(seq, sk))
2679 				return sk;
2680 		}
2681 		spin_unlock_bh(lock);
2682 	}
2683 
2684 	return NULL;
2685 }
2686 
/*
 * Return the next matching socket after @cur in established bucket
 * st->bucket, or the first matching socket of a later bucket through
 * established_get_first().
 *
 * This function takes no lock for the current bucket; it unlocks the
 * ehash lock of st->bucket when that bucket is exhausted, so the caller is
 * expected to hold it on entry.  st->num and st->offset are incremented on
 * every call.
 */
2687 static void *established_get_next(struct seq_file *seq, void *cur)
2688 {
2689 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2690 	struct tcp_iter_state *st = seq->private;
2691 	struct hlist_nulls_node *node;
2692 	struct sock *sk = cur;
2693 
2694 	++st->num;
2695 	++st->offset;
2696 
2697 	sk = sk_nulls_next(sk);
2698 
2699 	sk_nulls_for_each_from(sk, node) {
2700 		if (seq_sk_match(seq, sk))
2701 			return sk;
2702 	}
2703 
     	/* Bucket exhausted: drop its lock and continue with the next one. */
2704 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2705 	++st->bucket;
2706 	return established_get_first(seq);
2707 }
2708 
2709 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2710 {
2711 	struct tcp_iter_state *st = seq->private;
2712 	void *rc;
2713 
2714 	st->bucket = 0;
2715 	rc = established_get_first(seq);
2716 
2717 	while (rc && pos) {
2718 		rc = established_get_next(seq, rc);
2719 		--pos;
2720 	}
2721 	return rc;
2722 }
2723 
2724 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2725 {
2726 	void *rc;
2727 	struct tcp_iter_state *st = seq->private;
2728 
2729 	st->state = TCP_SEQ_STATE_LISTENING;
2730 	rc	  = listening_get_idx(seq, &pos);
2731 
2732 	if (!rc) {
2733 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2734 		rc	  = established_get_idx(seq, pos);
2735 	}
2736 
2737 	return rc;
2738 }
2739 
/*
 * Try to resume iteration at the position saved in the iterator state
 * (st->state, st->bucket, st->offset) instead of walking from the start.
 *
 * Starting at the first socket of the saved bucket, up to the saved offset
 * entries are skipped as long as iteration stays in that bucket.  If the
 * listening table yields nothing, the search moves on to the established
 * table from bucket 0.  st->num is restored to its value on entry, because
 * the *_get_next() helpers used for skipping increment it.
 *
 * Returns the socket found, or NULL.
 */
2740 static void *tcp_seek_last_pos(struct seq_file *seq)
2741 {
2742 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2743 	struct tcp_iter_state *st = seq->private;
2744 	int bucket = st->bucket;
2745 	int offset = st->offset;
2746 	int orig_num = st->num;
2747 	void *rc = NULL;
2748 
2749 	switch (st->state) {
2750 	case TCP_SEQ_STATE_LISTENING:
2751 		if (st->bucket > hinfo->lhash2_mask)
2752 			break;
2753 		rc = listening_get_first(seq);
     		/* Stop skipping as soon as iteration leaves the saved bucket. */
2754 		while (offset-- && rc && bucket == st->bucket)
2755 			rc = listening_get_next(seq, rc);
2756 		if (rc)
2757 			break;
2758 		st->bucket = 0;
2759 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2760 		fallthrough;
2761 	case TCP_SEQ_STATE_ESTABLISHED:
2762 		if (st->bucket > hinfo->ehash_mask)
2763 			break;
2764 		rc = established_get_first(seq);
2765 		while (offset-- && rc && bucket == st->bucket)
2766 			rc = established_get_next(seq, rc);
2767 	}
2768 
2769 	st->num = orig_num;
2770 
2771 	return rc;
2772 }
2773 
2774 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2775 {
2776 	struct tcp_iter_state *st = seq->private;
2777 	void *rc;
2778 
2779 	if (*pos && *pos == st->last_pos) {
2780 		rc = tcp_seek_last_pos(seq);
2781 		if (rc)
2782 			goto out;
2783 	}
2784 
2785 	st->state = TCP_SEQ_STATE_LISTENING;
2786 	st->num = 0;
2787 	st->bucket = 0;
2788 	st->offset = 0;
2789 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2790 
2791 out:
2792 	st->last_pos = *pos;
2793 	return rc;
2794 }
2795 EXPORT_SYMBOL(tcp_seq_start);
2796 
2797 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2798 {
2799 	struct tcp_iter_state *st = seq->private;
2800 	void *rc = NULL;
2801 
2802 	if (v == SEQ_START_TOKEN) {
2803 		rc = tcp_get_idx(seq, 0);
2804 		goto out;
2805 	}
2806 
2807 	switch (st->state) {
2808 	case TCP_SEQ_STATE_LISTENING:
2809 		rc = listening_get_next(seq, v);
2810 		if (!rc) {
2811 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2812 			st->bucket = 0;
2813 			st->offset = 0;
2814 			rc	  = established_get_first(seq);
2815 		}
2816 		break;
2817 	case TCP_SEQ_STATE_ESTABLISHED:
2818 		rc = established_get_next(seq, v);
2819 		break;
2820 	}
2821 out:
2822 	++*pos;
2823 	st->last_pos = *pos;
2824 	return rc;
2825 }
2826 EXPORT_SYMBOL(tcp_seq_next);
2827 
2828 void tcp_seq_stop(struct seq_file *seq, void *v)
2829 {
2830 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2831 	struct tcp_iter_state *st = seq->private;
2832 
2833 	switch (st->state) {
2834 	case TCP_SEQ_STATE_LISTENING:
2835 		if (v != SEQ_START_TOKEN)
2836 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2837 		break;
2838 	case TCP_SEQ_STATE_ESTABLISHED:
2839 		if (v)
2840 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2841 		break;
2842 	}
2843 }
2844 EXPORT_SYMBOL(tcp_seq_stop);
2845 
2846 static void get_openreq4(const struct request_sock *req,
2847 			 struct seq_file *f, int i)
2848 {
2849 	const struct inet_request_sock *ireq = inet_rsk(req);
2850 	long delta = req->rsk_timer.expires - jiffies;
2851 
2852 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2853 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2854 		i,
2855 		ireq->ir_loc_addr,
2856 		ireq->ir_num,
2857 		ireq->ir_rmt_addr,
2858 		ntohs(ireq->ir_rmt_port),
2859 		TCP_SYN_RECV,
2860 		0, 0, /* could print option size, but that is af dependent. */
2861 		1,    /* timers active (only the expire timer) */
2862 		jiffies_delta_to_clock_t(delta),
2863 		req->num_timeout,
2864 		from_kuid_munged(seq_user_ns(f),
2865 				 sock_i_uid(req->rsk_listener)),
2866 		0,  /* non standard timer */
2867 		0, /* open_requests have no inode */
2868 		0,
2869 		req);
2870 }
2871 
/*
 * Print one table line for the full socket @sk, with entry number @i.
 *
 * The "tr" column (timer_active) is chosen as follows:
 *   1 - retransmit, reorder-timeout or loss-probe timer pending
 *   4 - zero-window probe timer pending
 *   2 - sk->sk_timer pending
 *   0 - none of the above (the expiry is then reported relative to now)
 *
 * rx_queue is sk_ack_backlog for listeners, otherwise rcv_nxt - copied_seq
 * clamped at 0.  The last column is fastopenq->max_qlen for listeners,
 * otherwise -1 while in initial slow start or else snd_ssthresh.
 */
2872 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2873 {
2874 	int timer_active;
2875 	unsigned long timer_expires;
2876 	const struct tcp_sock *tp = tcp_sk(sk);
2877 	const struct inet_connection_sock *icsk = inet_csk(sk);
2878 	const struct inet_sock *inet = inet_sk(sk);
2879 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2880 	__be32 dest = inet->inet_daddr;
2881 	__be32 src = inet->inet_rcv_saddr;
2882 	__u16 destp = ntohs(inet->inet_dport);
2883 	__u16 srcp = ntohs(inet->inet_sport);
2884 	int rx_queue;
2885 	int state;
2886 
2887 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2888 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2889 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2890 		timer_active	= 1;
2891 		timer_expires	= icsk->icsk_timeout;
2892 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2893 		timer_active	= 4;
2894 		timer_expires	= icsk->icsk_timeout;
2895 	} else if (timer_pending(&sk->sk_timer)) {
2896 		timer_active	= 2;
2897 		timer_expires	= sk->sk_timer.expires;
2898 	} else {
2899 		timer_active	= 0;
2900 		timer_expires = jiffies;
2901 	}
2902 
2903 	state = inet_sk_state_load(sk);
2904 	if (state == TCP_LISTEN)
2905 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2906 	else
2907 		/* Because we don't lock the socket,
2908 		 * we might find a transient negative value.
2909 		 */
2910 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2911 				      READ_ONCE(tp->copied_seq), 0);
2912 
2913 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2914 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2915 		i, src, srcp, dest, destp, state,
2916 		READ_ONCE(tp->write_seq) - tp->snd_una,
2917 		rx_queue,
2918 		timer_active,
2919 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2920 		icsk->icsk_retransmits,
2921 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2922 		icsk->icsk_probes_out,
2923 		sock_i_ino(sk),
2924 		refcount_read(&sk->sk_refcnt), sk,
2925 		jiffies_to_clock_t(icsk->icsk_rto),
2926 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2927 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2928 		tcp_snd_cwnd(tp),
2929 		state == TCP_LISTEN ?
2930 		    fastopenq->max_qlen :
2931 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2932 }
2933 
2934 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2935 			       struct seq_file *f, int i)
2936 {
2937 	long delta = tw->tw_timer.expires - jiffies;
2938 	__be32 dest, src;
2939 	__u16 destp, srcp;
2940 
2941 	dest  = tw->tw_daddr;
2942 	src   = tw->tw_rcv_saddr;
2943 	destp = ntohs(tw->tw_dport);
2944 	srcp  = ntohs(tw->tw_sport);
2945 
2946 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2947 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2948 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2949 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2950 		refcount_read(&tw->tw_refcnt), tw);
2951 }
2952 
2953 #define TMPSZ 150
2954 
2955 static int tcp4_seq_show(struct seq_file *seq, void *v)
2956 {
2957 	struct tcp_iter_state *st;
2958 	struct sock *sk = v;
2959 
2960 	seq_setwidth(seq, TMPSZ - 1);
2961 	if (v == SEQ_START_TOKEN) {
2962 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2963 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2964 			   "inode");
2965 		goto out;
2966 	}
2967 	st = seq->private;
2968 
2969 	if (sk->sk_state == TCP_TIME_WAIT)
2970 		get_timewait4_sock(v, seq, st->num);
2971 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2972 		get_openreq4(v, seq, st->num);
2973 	else
2974 		get_tcp4_sock(v, seq, st->num);
2975 out:
2976 	seq_pad(seq, '\n');
2977 	return 0;
2978 }
2979 
2980 #ifdef CONFIG_BPF_SYSCALL
/* Private state of the BPF TCP iterator: the regular /proc iterator state
 * plus a batch of sockets collected from one bucket.
 */
2981 struct bpf_tcp_iter_state {
     	/* Embedded /proc iterator state (bucket, offset, ...). */
2982 	struct tcp_iter_state state;
     	/* Index of the next batch[] entry to consume or release. */
2983 	unsigned int cur_sk;
     	/* One past the last valid batch[] entry. */
2984 	unsigned int end_sk;
     	/* Number of entries batch[] can hold. */
2985 	unsigned int max_sk;
     	/* Array of referenced sockets. */
2986 	struct sock **batch;
     	/* When set, bpf_iter_tcp_batch() advances straight to the next bucket. */
2987 	bool st_bucket_done;
2988 };
2989 
/* Context handed to the BPF program by tcp_prog_seq_show(): iterator
 * metadata, the socket being visited, and a uid value.
 */
2990 struct bpf_iter__tcp {
2991 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2992 	__bpf_md_ptr(struct sock_common *, sk_common);
2993 	uid_t uid __aligned(8);
2994 };
2995 
2996 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2997 			     struct sock_common *sk_common, uid_t uid)
2998 {
2999 	struct bpf_iter__tcp ctx;
3000 
3001 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3002 	ctx.meta = meta;
3003 	ctx.sk_common = sk_common;
3004 	ctx.uid = uid;
3005 	return bpf_iter_run_prog(prog, &ctx);
3006 }
3007 
3008 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3009 {
3010 	while (iter->cur_sk < iter->end_sk)
3011 		sock_gen_put(iter->batch[iter->cur_sk++]);
3012 }
3013 
3014 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3015 				      unsigned int new_batch_sz)
3016 {
3017 	struct sock **new_batch;
3018 
3019 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3020 			     GFP_USER | __GFP_NOWARN);
3021 	if (!new_batch)
3022 		return -ENOMEM;
3023 
3024 	bpf_iter_tcp_put_batch(iter);
3025 	kvfree(iter->batch);
3026 	iter->batch = new_batch;
3027 	iter->max_sk = new_batch_sz;
3028 
3029 	return 0;
3030 }
3031 
3032 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3033 						 struct sock *start_sk)
3034 {
3035 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3036 	struct bpf_tcp_iter_state *iter = seq->private;
3037 	struct tcp_iter_state *st = &iter->state;
3038 	struct hlist_nulls_node *node;
3039 	unsigned int expected = 1;
3040 	struct sock *sk;
3041 
3042 	sock_hold(start_sk);
3043 	iter->batch[iter->end_sk++] = start_sk;
3044 
3045 	sk = sk_nulls_next(start_sk);
3046 	sk_nulls_for_each_from(sk, node) {
3047 		if (seq_sk_match(seq, sk)) {
3048 			if (iter->end_sk < iter->max_sk) {
3049 				sock_hold(sk);
3050 				iter->batch[iter->end_sk++] = sk;
3051 			}
3052 			expected++;
3053 		}
3054 	}
3055 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3056 
3057 	return expected;
3058 }
3059 
3060 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3061 						   struct sock *start_sk)
3062 {
3063 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3064 	struct bpf_tcp_iter_state *iter = seq->private;
3065 	struct tcp_iter_state *st = &iter->state;
3066 	struct hlist_nulls_node *node;
3067 	unsigned int expected = 1;
3068 	struct sock *sk;
3069 
3070 	sock_hold(start_sk);
3071 	iter->batch[iter->end_sk++] = start_sk;
3072 
3073 	sk = sk_nulls_next(start_sk);
3074 	sk_nulls_for_each_from(sk, node) {
3075 		if (seq_sk_match(seq, sk)) {
3076 			if (iter->end_sk < iter->max_sk) {
3077 				sock_hold(sk);
3078 				iter->batch[iter->end_sk++] = sk;
3079 			}
3080 			expected++;
3081 		}
3082 	}
3083 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3084 
3085 	return expected;
3086 }
3087 
3088 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3089 {
3090 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3091 	struct bpf_tcp_iter_state *iter = seq->private;
3092 	struct tcp_iter_state *st = &iter->state;
3093 	unsigned int expected;
3094 	bool resized = false;
3095 	struct sock *sk;
3096 
3097 	/* The st->bucket is done.  Directly advance to the next
3098 	 * bucket instead of having the tcp_seek_last_pos() to skip
3099 	 * one by one in the current bucket and eventually find out
3100 	 * it has to advance to the next bucket.
3101 	 */
3102 	if (iter->st_bucket_done) {
3103 		st->offset = 0;
3104 		st->bucket++;
3105 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3106 		    st->bucket > hinfo->lhash2_mask) {
3107 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3108 			st->bucket = 0;
3109 		}
3110 	}
3111 
3112 again:
3113 	/* Get a new batch */
3114 	iter->cur_sk = 0;
3115 	iter->end_sk = 0;
3116 	iter->st_bucket_done = false;
3117 
3118 	sk = tcp_seek_last_pos(seq);
3119 	if (!sk)
3120 		return NULL; /* Done */
3121 
3122 	if (st->state == TCP_SEQ_STATE_LISTENING)
3123 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3124 	else
3125 		expected = bpf_iter_tcp_established_batch(seq, sk);
3126 
3127 	if (iter->end_sk == expected) {
3128 		iter->st_bucket_done = true;
3129 		return sk;
3130 	}
3131 
3132 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3133 		resized = true;
3134 		goto again;
3135 	}
3136 
3137 	return sk;
3138 }
3139 
3140 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3141 {
3142 	/* bpf iter does not support lseek, so it always
3143 	 * continue from where it was stop()-ped.
3144 	 */
3145 	if (*pos)
3146 		return bpf_iter_tcp_batch(seq);
3147 
3148 	return SEQ_START_TOKEN;
3149 }
3150 
3151 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3152 {
3153 	struct bpf_tcp_iter_state *iter = seq->private;
3154 	struct tcp_iter_state *st = &iter->state;
3155 	struct sock *sk;
3156 
3157 	/* Whenever seq_next() is called, the iter->cur_sk is
3158 	 * done with seq_show(), so advance to the next sk in
3159 	 * the batch.
3160 	 */
3161 	if (iter->cur_sk < iter->end_sk) {
3162 		/* Keeping st->num consistent in tcp_iter_state.
3163 		 * bpf_iter_tcp does not use st->num.
3164 		 * meta.seq_num is used instead.
3165 		 */
3166 		st->num++;
3167 		/* Move st->offset to the next sk in the bucket such that
3168 		 * the future start() will resume at st->offset in
3169 		 * st->bucket.  See tcp_seek_last_pos().
3170 		 */
3171 		st->offset++;
3172 		sock_gen_put(iter->batch[iter->cur_sk++]);
3173 	}
3174 
3175 	if (iter->cur_sk < iter->end_sk)
3176 		sk = iter->batch[iter->cur_sk];
3177 	else
3178 		sk = bpf_iter_tcp_batch(seq);
3179 
3180 	++*pos;
3181 	/* Keeping st->last_pos consistent in tcp_iter_state.
3182 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3183 	 */
3184 	st->last_pos = *pos;
3185 	return sk;
3186 }
3187 
3188 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3189 {
3190 	struct bpf_iter_meta meta;
3191 	struct bpf_prog *prog;
3192 	struct sock *sk = v;
3193 	uid_t uid;
3194 	int ret;
3195 
3196 	if (v == SEQ_START_TOKEN)
3197 		return 0;
3198 
3199 	if (sk_fullsock(sk))
3200 		lock_sock(sk);
3201 
3202 	if (unlikely(sk_unhashed(sk))) {
3203 		ret = SEQ_SKIP;
3204 		goto unlock;
3205 	}
3206 
3207 	if (sk->sk_state == TCP_TIME_WAIT) {
3208 		uid = 0;
3209 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3210 		const struct request_sock *req = v;
3211 
3212 		uid = from_kuid_munged(seq_user_ns(seq),
3213 				       sock_i_uid(req->rsk_listener));
3214 	} else {
3215 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3216 	}
3217 
3218 	meta.seq = seq;
3219 	prog = bpf_iter_get_info(&meta, false);
3220 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3221 
3222 unlock:
3223 	if (sk_fullsock(sk))
3224 		release_sock(sk);
3225 	return ret;
3226 
3227 }
3228 
3229 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3230 {
3231 	struct bpf_tcp_iter_state *iter = seq->private;
3232 	struct bpf_iter_meta meta;
3233 	struct bpf_prog *prog;
3234 
3235 	if (!v) {
3236 		meta.seq = seq;
3237 		prog = bpf_iter_get_info(&meta, true);
3238 		if (prog)
3239 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3240 	}
3241 
3242 	if (iter->cur_sk < iter->end_sk) {
3243 		bpf_iter_tcp_put_batch(iter);
3244 		iter->st_bucket_done = false;
3245 	}
3246 }
3247 
3248 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3249 	.show		= bpf_iter_tcp_seq_show,
3250 	.start		= bpf_iter_tcp_seq_start,
3251 	.next		= bpf_iter_tcp_seq_next,
3252 	.stop		= bpf_iter_tcp_seq_stop,
3253 };
3254 #endif
3255 static unsigned short seq_file_family(const struct seq_file *seq)
3256 {
3257 	const struct tcp_seq_afinfo *afinfo;
3258 
3259 #ifdef CONFIG_BPF_SYSCALL
3260 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3261 	if (seq->op == &bpf_iter_tcp_seq_ops)
3262 		return AF_UNSPEC;
3263 #endif
3264 
3265 	/* Iterated from proc fs */
3266 	afinfo = pde_data(file_inode(seq->file));
3267 	return afinfo->family;
3268 }
3269 
3270 static const struct seq_operations tcp4_seq_ops = {
3271 	.show		= tcp4_seq_show,
3272 	.start		= tcp_seq_start,
3273 	.next		= tcp_seq_next,
3274 	.stop		= tcp_seq_stop,
3275 };
3276 
/* Data attached to the "tcp" proc entry by tcp4_proc_init_net();
 * seq_file_family() reads .family back from it.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3280 
3281 static int __net_init tcp4_proc_init_net(struct net *net)
3282 {
3283 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3284 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3285 		return -ENOMEM;
3286 	return 0;
3287 }
3288 
3289 static void __net_exit tcp4_proc_exit_net(struct net *net)
3290 {
3291 	remove_proc_entry("tcp", net->proc_net);
3292 }
3293 
/* Per-netns hooks creating/removing the "tcp" proc entry; registered by
 * tcp4_proc_init() and unregistered by tcp4_proc_exit().
 */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3298 
3299 int __init tcp4_proc_init(void)
3300 {
3301 	return register_pernet_subsys(&tcp4_net_ops);
3302 }
3303 
3304 void tcp4_proc_exit(void)
3305 {
3306 	unregister_pernet_subsys(&tcp4_net_ops);
3307 }
3308 #endif /* CONFIG_PROC_FS */
3309 
3310 /* @wake is one when sk_stream_write_space() calls us.
3311  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3312  * This mimics the strategy used in sock_def_write_space().
3313  */
3314 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3315 {
3316 	const struct tcp_sock *tp = tcp_sk(sk);
3317 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3318 			    READ_ONCE(tp->snd_nxt);
3319 
3320 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3321 }
3322 EXPORT_SYMBOL(tcp_stream_memory_free);
3323 
/* struct proto instance for TCP sockets handled by this file (IPv4).
 * It wires the generic TCP entry points together with the IPv4 specific
 * ones (tcp_v4_*) and is exported for use outside this file.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory accounting and memory-pressure state. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	/* wmem/rmem limits are located through their offset in struct net. */
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* NOTE(review): no hashinfo here; the hash tables are presumably
	 * reached through net->ipv4.tcp_death_row.hashinfo (set up by
	 * tcp_set_hashinfo()) - confirm.
	 */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3375 
3376 static void __net_exit tcp_sk_exit(struct net *net)
3377 {
3378 	if (net->ipv4.tcp_congestion_control)
3379 		bpf_module_put(net->ipv4.tcp_congestion_control,
3380 			       net->ipv4.tcp_congestion_control->owner);
3381 }
3382 
3383 static void __net_init tcp_set_hashinfo(struct net *net)
3384 {
3385 	struct inet_hashinfo *hinfo;
3386 	unsigned int ehash_entries;
3387 	struct net *old_net;
3388 
3389 	if (net_eq(net, &init_net))
3390 		goto fallback;
3391 
3392 	old_net = current->nsproxy->net_ns;
3393 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3394 	if (!ehash_entries)
3395 		goto fallback;
3396 
3397 	ehash_entries = roundup_pow_of_two(ehash_entries);
3398 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3399 	if (!hinfo) {
3400 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3401 			"for a netns, fallback to the global one\n",
3402 			ehash_entries);
3403 fallback:
3404 		hinfo = &tcp_hashinfo;
3405 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3406 	}
3407 
3408 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3409 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3410 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3411 }
3412 
/* Per-netns init: set the default value of the TCP sysctls of @net,
 * select its hash tables (tcp_set_hashinfo()) and pick its default
 * congestion control.  Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	net->ipv4.sysctl_tcp_ecn = 2;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	/* tw_refcount starts at 1; tcp_sk_exit_batch() expects to drop
	 * the last reference.  tcp_set_hashinfo() also sets
	 * sysctl_max_tw_buckets and sysctl_max_syn_backlog.
	 */
	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* A netns other than init_net starts from init_net's current
	 * tcp_rmem/tcp_wmem values; init_net's own arrays are not
	 * written here.
	 */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	/* A netns other than init_net inherits init_net's congestion
	 * control when a module reference on it can be taken; init_net
	 * itself, or a failed module get, ends up with reno.
	 */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);

	return 0;
}
3514 
3515 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3516 {
3517 	struct net *net;
3518 
3519 	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3520 	 * and failed setup_net error unwinding path are serialized.
3521 	 *
3522 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3523 	 * net_exit_list, the thread that dismantles a particular twsk must
3524 	 * do so without other thread progressing to refcount_dec_and_test() of
3525 	 * tcp_death_row.tw_refcount.
3526 	 */
3527 	mutex_lock(&tcp_exit_batch_mutex);
3528 
3529 	tcp_twsk_purge(net_exit_list);
3530 
3531 	list_for_each_entry(net, net_exit_list, exit_list) {
3532 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3533 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3534 		tcp_fastopen_ctx_destroy(net);
3535 	}
3536 
3537 	mutex_unlock(&tcp_exit_batch_mutex);
3538 }
3539 
3540 static struct pernet_operations __net_initdata tcp_sk_ops = {
3541        .init	   = tcp_sk_init,
3542        .exit	   = tcp_sk_exit,
3543        .exit_batch = tcp_sk_exit_batch,
3544 };
3545 
3546 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf iterator "tcp" program signature; the arguments
 * mirror the fields of struct bpf_iter__tcp.
 * NOTE(review): exact expansion of DEFINE_BPF_ITER_FUNC is not visible
 * here - confirm against its definition.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Number of slots in the batch array allocated by bpf_iter_init_tcp(). */
#define INIT_BATCH_SZ 16
3551 
3552 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3553 {
3554 	struct bpf_tcp_iter_state *iter = priv_data;
3555 	int err;
3556 
3557 	err = bpf_iter_init_seq_net(priv_data, aux);
3558 	if (err)
3559 		return err;
3560 
3561 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3562 	if (err) {
3563 		bpf_iter_fini_seq_net(priv_data);
3564 		return err;
3565 	}
3566 
3567 	return 0;
3568 }
3569 
3570 static void bpf_iter_fini_tcp(void *priv_data)
3571 {
3572 	struct bpf_tcp_iter_state *iter = priv_data;
3573 
3574 	bpf_iter_fini_seq_net(priv_data);
3575 	kvfree(iter->batch);
3576 }
3577 
3578 static const struct bpf_iter_seq_info tcp_seq_info = {
3579 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3580 	.init_seq_private	= bpf_iter_init_tcp,
3581 	.fini_seq_private	= bpf_iter_fini_tcp,
3582 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3583 };
3584 
3585 static const struct bpf_func_proto *
3586 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3587 			    const struct bpf_prog *prog)
3588 {
3589 	switch (func_id) {
3590 	case BPF_FUNC_setsockopt:
3591 		return &bpf_sk_setsockopt_proto;
3592 	case BPF_FUNC_getsockopt:
3593 		return &bpf_sk_getsockopt_proto;
3594 	default:
3595 		return NULL;
3596 	}
3597 }
3598 
/* Registration record of the bpf iterator target "tcp".
 *
 * Not const: bpf_iter_register() fills in ctx_arg_info[0].btf_id before
 * registering it.
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		/* bpf_iter__tcp.sk_common: may be NULL (see stop()). */
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3609 
3610 static void __init bpf_iter_register(void)
3611 {
3612 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3613 	if (bpf_iter_reg_target(&tcp_reg_info))
3614 		pr_warn("Warning: could not register bpf iterator tcp\n");
3615 }
3616 
3617 #endif
3618 
3619 void __init tcp_v4_init(void)
3620 {
3621 	int cpu, res;
3622 
3623 	for_each_possible_cpu(cpu) {
3624 		struct sock *sk;
3625 
3626 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3627 					   IPPROTO_TCP, &init_net);
3628 		if (res)
3629 			panic("Failed to create the TCP control socket.\n");
3630 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3631 
3632 		/* Please enforce IP_DF and IPID==0 for RST and
3633 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3634 		 */
3635 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3636 
3637 		sk->sk_clockid = CLOCK_MONOTONIC;
3638 
3639 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3640 	}
3641 	if (register_pernet_subsys(&tcp_sk_ops))
3642 		panic("Failed to create the TCP control socket.\n");
3643 
3644 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3645 	bpf_iter_register();
3646 #endif
3647 }
3648