xref: /linux/net/ipv4/tcp_ipv4.c (revision 3ff78451b8e446e9a548b98a0d4dd8d24dc5780b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 
83 #include <crypto/hash.h>
84 #include <linux/scatterlist.h>
85 
86 #include <trace/events/tcp.h>
87 
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: used by the reset and ACK reply builders further down
 * to sign the reply header before the definition has been seen.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Global inet_hashinfo instance for TCP, exported to other code. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-CPU socket pointer.  tcp_v4_send_reset() and tcp_v4_send_ack() read it
 * with bottom halves disabled and pass it to ip_send_unicast_reply() as the
 * socket the reply is sent from.
 */
static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
97 
98 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99 {
100 	return secure_tcp_seq(ip_hdr(skb)->daddr,
101 			      ip_hdr(skb)->saddr,
102 			      tcp_hdr(skb)->dest,
103 			      tcp_hdr(skb)->source);
104 }
105 
106 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107 {
108 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109 }
110 
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
114 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
115 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
116 	struct tcp_sock *tp = tcp_sk(sk);
117 
118 	if (reuse == 2) {
119 		/* Still does not detect *everything* that goes through
120 		 * lo, since we require a loopback src or dst address
121 		 * or direct binding to 'lo' interface.
122 		 */
123 		bool loopback = false;
124 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
125 			loopback = true;
126 #if IS_ENABLED(CONFIG_IPV6)
127 		if (tw->tw_family == AF_INET6) {
128 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
130 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
131 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
132 				loopback = true;
133 		} else
134 #endif
135 		{
136 			if (ipv4_is_loopback(tw->tw_daddr) ||
137 			    ipv4_is_loopback(tw->tw_rcv_saddr))
138 				loopback = true;
139 		}
140 		if (!loopback)
141 			reuse = 0;
142 	}
143 
144 	/* With PAWS, it is safe from the viewpoint
145 	   of data integrity. Even without PAWS it is safe provided sequence
146 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
147 
148 	   Actually, the idea is close to VJ's one, only timestamp cache is
149 	   held not per host, but per port pair and TW bucket is used as state
150 	   holder.
151 
152 	   If TW bucket has been already destroyed we fall back to VJ's scheme
153 	   and use initial timestamp retrieved from peer table.
154 	 */
155 	if (tcptw->tw_ts_recent_stamp &&
156 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
157 					    tcptw->tw_ts_recent_stamp)))) {
158 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
159 		 * and releasing the bucket lock.
160 		 */
161 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
162 			return 0;
163 
164 		/* In case of repair and re-using TIME-WAIT sockets we still
165 		 * want to be sure that it is safe as above but honor the
166 		 * sequence numbers and time stamps set as part of the repair
167 		 * process.
168 		 *
169 		 * Without this check re-using a TIME-WAIT socket with TCP
170 		 * repair would accumulate a -1 on the repair assigned
171 		 * sequence number. The first time it is reused the sequence
172 		 * is -1, the second time -2, etc. This fixes that issue
173 		 * without appearing to create any others.
174 		 */
175 		if (likely(!tp->repair)) {
176 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
177 
178 			if (!seq)
179 				seq = 1;
180 			WRITE_ONCE(tp->write_seq, seq);
181 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
182 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
183 		}
184 
185 		return 1;
186 	}
187 
188 	return 0;
189 }
190 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
191 
192 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
193 			      int addr_len)
194 {
195 	/* This check is replicated from tcp_v4_connect() and intended to
196 	 * prevent BPF program called below from accessing bytes that are out
197 	 * of the bound specified by user in addr_len.
198 	 */
199 	if (addr_len < sizeof(struct sockaddr_in))
200 		return -EINVAL;
201 
202 	sock_owned_by_me(sk);
203 
204 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
205 }
206 
/* Initiate an outgoing IPv4 TCP connection.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr as supplied by the caller
 *
 * Returns 0 on success, otherwise the error of the step that failed.  Errors
 * detected before the socket enters SYN-SENT return directly; later errors go
 * through the "failure" label, which moves the socket back to TCP_CLOSE and
 * clears the destination port and route capabilities.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source-route option the route is looked up towards the
	 * option's first hop rather than the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for this lookup; they are handed to
	 * ip_route_newports() once the source port has been chosen.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* Connecting to a multicast or broadcast route is refused. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination from the flow. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	/* No source address set yet: take the one from the flow. */
	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	/* Account for the length of any IP options on this socket. */
	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Redo the route now that the final source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* rt holds an error pointer; clear it so the failure path's
		 * ip_rt_put() is not handed that value.
		 */
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* The route has been handed to the socket; the failure path must not
	 * release it again.
	 */
	rt = NULL;

	/* Under repair the sequence number and timestamp offset are left as
	 * they are; otherwise pick them now.  A non-zero write_seq (e.g. set
	 * by tcp_twsk_unique()) is kept.
	 */
	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* Fast Open may defer the connect; in that case return its result
	 * without going through the failure path.
	 */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
351 
352 /*
353  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
354  * It can be called through tcp_release_cb() if socket was owned by user
355  * at the time tcp_v4_err() was called to handle ICMP message.
356  */
357 void tcp_v4_mtu_reduced(struct sock *sk)
358 {
359 	struct inet_sock *inet = inet_sk(sk);
360 	struct dst_entry *dst;
361 	u32 mtu;
362 
363 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
364 		return;
365 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
366 	dst = inet_csk_update_pmtu(sk, mtu);
367 	if (!dst)
368 		return;
369 
370 	/* Something is about to be wrong... Remember soft error
371 	 * for the case, if this connection will not able to recover.
372 	 */
373 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
374 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
375 
376 	mtu = dst_mtu(dst);
377 
378 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
379 	    ip_sk_accept_pmtu(sk) &&
380 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
381 		tcp_sync_mss(sk, mtu);
382 
383 		/* Resend the TCP packet because it's
384 		 * clear that the old packet has been
385 		 * dropped. This is the new "fast" path mtu
386 		 * discovery.
387 		 */
388 		tcp_simple_retransmit(sk);
389 	} /* else let the usual retransmit timer handle it */
390 }
391 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
392 
393 static void do_redirect(struct sk_buff *skb, struct sock *sk)
394 {
395 	struct dst_entry *dst = __sk_dst_check(sk, 0);
396 
397 	if (dst)
398 		dst->ops->redirect(dst, sk, skb);
399 }
400 
401 
402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
403 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
404 {
405 	struct request_sock *req = inet_reqsk(sk);
406 	struct net *net = sock_net(sk);
407 
408 	/* ICMPs are not backlogged, hence we cannot get
409 	 * an established socket here.
410 	 */
411 	if (seq != tcp_rsk(req)->snt_isn) {
412 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
413 	} else if (abort) {
414 		/*
415 		 * Still in SYN_RECV, just remove it silently.
416 		 * There is no good way to pass the error to the newly
417 		 * created socket, and POSIX does not want network
418 		 * errors returned from accept().
419 		 */
420 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
421 		tcp_listendrop(req->rsk_listener);
422 	}
423 	reqsk_put(req);
424 }
425 EXPORT_SYMBOL(tcp_req_err);
426 
427 /* TCP-LD (RFC 6069) logic */
428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
429 {
430 	struct inet_connection_sock *icsk = inet_csk(sk);
431 	struct tcp_sock *tp = tcp_sk(sk);
432 	struct sk_buff *skb;
433 	s32 remaining;
434 	u32 delta_us;
435 
436 	if (sock_owned_by_user(sk))
437 		return;
438 
439 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
440 	    !icsk->icsk_backoff)
441 		return;
442 
443 	skb = tcp_rtx_queue_head(sk);
444 	if (WARN_ON_ONCE(!skb))
445 		return;
446 
447 	icsk->icsk_backoff--;
448 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
449 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
450 
451 	tcp_mstamp_refresh(tp);
452 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
453 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
454 
455 	if (remaining > 0) {
456 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
457 					  remaining, TCP_RTO_MAX);
458 	} else {
459 		/* RTO revert clocked out retransmission.
460 		 * Will retransmit now.
461 		 */
462 		tcp_retransmit_timer(sk);
463 	}
464 }
465 EXPORT_SYMBOL(tcp_ld_RTO_revert);
466 
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition for a TCP packet.
 *
 * @skb:  the ICMP error; skb->data is read as the IP header of the quoted
 *        packet, followed by the first bytes of its TCP header, which are
 *        used to find the socket and the quoted sequence number.
 * @info: extra value from the ICMP message; stored as the new MTU for
 *        ICMP_FRAG_NEEDED and passed on to ip_icmp_error().
 *
 * Returns -ENOENT when no socket matches the quoted packet, 0 otherwise.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	int err;
	struct net *net = dev_net(skb->dev);

	/* The quoted packet was sent by us, so its destination/source are the
	 * remote/local ends of the connection being looked up.
	 */
	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Request sockets are handled (and released) by tcp_req_err();
		 * the last argument says whether this error aborts the request.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Ignore errors quoting a sequence outside what is in flight. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno in err, or handle it fully here
	 * and jump to out.
	 */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Socket is busy: flag the MTU update as
				 * deferred and take a reference the first time
				 * the flag is set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		/* Connection not yet established: report a hard error and
		 * finish the socket, unless the user currently owns it.
		 */
		if (!sock_owned_by_user(sk)) {
			WRITE_ONCE(sk->sk_err, err);

			sk_error_report(sk);

			tcp_done(sk);
		} else {
			WRITE_ONCE(sk->sk_err_soft, err);
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
655 
656 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
657 {
658 	struct tcphdr *th = tcp_hdr(skb);
659 
660 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
661 	skb->csum_start = skb_transport_header(skb) - skb->head;
662 	skb->csum_offset = offsetof(struct tcphdr, check);
663 }
664 
665 /* This routine computes an IPv4 TCP checksum. */
666 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
667 {
668 	const struct inet_sock *inet = inet_sk(sk);
669 
670 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
671 }
672 EXPORT_SYMBOL(tcp_v4_send_check);
673 
/* Number of __be32 words in MAX_TCP_OPTION_SPACE bytes; sizes the option
 * array carried behind the TCP header in reset replies.
 */
#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
675 
676 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
677 				 const struct tcp_ao_hdr *aoh,
678 				 struct ip_reply_arg *arg, struct tcphdr *reply,
679 				 __be32 reply_options[REPLY_OPTIONS_LEN])
680 {
681 #ifdef CONFIG_TCP_AO
682 	int sdif = tcp_v4_sdif(skb);
683 	int dif = inet_iif(skb);
684 	int l3index = sdif ? dif : 0;
685 	bool allocated_traffic_key;
686 	struct tcp_ao_key *key;
687 	char *traffic_key;
688 	bool drop = true;
689 	u32 ao_sne = 0;
690 	u8 keyid;
691 
692 	rcu_read_lock();
693 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
694 				 &key, &traffic_key, &allocated_traffic_key,
695 				 &keyid, &ao_sne))
696 		goto out;
697 
698 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
699 				 (aoh->rnext_keyid << 8) | keyid);
700 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
701 	reply->doff = arg->iov[0].iov_len / 4;
702 
703 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
704 			    key, traffic_key,
705 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
706 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
707 			    reply, ao_sne))
708 		goto out;
709 	drop = false;
710 out:
711 	rcu_read_unlock();
712 	if (allocated_traffic_key)
713 		kfree(traffic_key);
714 	return drop;
715 #else
716 	return true;
717 #endif
718 }
719 
720 /*
721  *	This routine will send an RST to the other tcp.
722  *
723  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
724  *		      for reset.
725  *	Answer: if a packet caused RST, it is not for a socket
726  *		existing in our system, if it is matched to a socket,
727  *		it is just duplicate segment or bug in other side's TCP.
728  *		So that we build reply only basing on parameters
729  *		arrived with segment.
730  *	Exception: precedence violation. We do not implement it in any case.
731  */
732 
733 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
734 			      enum sk_rst_reason reason)
735 {
736 	const struct tcphdr *th = tcp_hdr(skb);
737 	struct {
738 		struct tcphdr th;
739 		__be32 opt[REPLY_OPTIONS_LEN];
740 	} rep;
741 	const __u8 *md5_hash_location = NULL;
742 	const struct tcp_ao_hdr *aoh;
743 	struct ip_reply_arg arg;
744 #ifdef CONFIG_TCP_MD5SIG
745 	struct tcp_md5sig_key *key = NULL;
746 	unsigned char newhash[16];
747 	struct sock *sk1 = NULL;
748 	int genhash;
749 #endif
750 	u64 transmit_time = 0;
751 	struct sock *ctl_sk;
752 	struct net *net;
753 	u32 txhash = 0;
754 
755 	/* Never send a reset in response to a reset. */
756 	if (th->rst)
757 		return;
758 
759 	/* If sk not NULL, it means we did a successful lookup and incoming
760 	 * route had to be correct. prequeue might have dropped our dst.
761 	 */
762 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
763 		return;
764 
765 	/* Swap the send and the receive. */
766 	memset(&rep, 0, sizeof(rep));
767 	rep.th.dest   = th->source;
768 	rep.th.source = th->dest;
769 	rep.th.doff   = sizeof(struct tcphdr) / 4;
770 	rep.th.rst    = 1;
771 
772 	if (th->ack) {
773 		rep.th.seq = th->ack_seq;
774 	} else {
775 		rep.th.ack = 1;
776 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
777 				       skb->len - (th->doff << 2));
778 	}
779 
780 	memset(&arg, 0, sizeof(arg));
781 	arg.iov[0].iov_base = (unsigned char *)&rep;
782 	arg.iov[0].iov_len  = sizeof(rep.th);
783 
784 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
785 
786 	/* Invalid TCP option size or twice included auth */
787 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
788 		return;
789 
790 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
791 		return;
792 
793 #ifdef CONFIG_TCP_MD5SIG
794 	rcu_read_lock();
795 	if (sk && sk_fullsock(sk)) {
796 		const union tcp_md5_addr *addr;
797 		int l3index;
798 
799 		/* sdif set, means packet ingressed via a device
800 		 * in an L3 domain and inet_iif is set to it.
801 		 */
802 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
803 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
804 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
805 	} else if (md5_hash_location) {
806 		const union tcp_md5_addr *addr;
807 		int sdif = tcp_v4_sdif(skb);
808 		int dif = inet_iif(skb);
809 		int l3index;
810 
811 		/*
812 		 * active side is lost. Try to find listening socket through
813 		 * source port, and then find md5 key through listening socket.
814 		 * we are not loose security here:
815 		 * Incoming packet is checked with md5 hash with finding key,
816 		 * no RST generated if md5 hash doesn't match.
817 		 */
818 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
819 					     NULL, 0, ip_hdr(skb)->saddr,
820 					     th->source, ip_hdr(skb)->daddr,
821 					     ntohs(th->source), dif, sdif);
822 		/* don't send rst if it can't find key */
823 		if (!sk1)
824 			goto out;
825 
826 		/* sdif set, means packet ingressed via a device
827 		 * in an L3 domain and dif is set to it.
828 		 */
829 		l3index = sdif ? dif : 0;
830 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
831 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
832 		if (!key)
833 			goto out;
834 
835 
836 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
837 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
838 			goto out;
839 
840 	}
841 
842 	if (key) {
843 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
844 				   (TCPOPT_NOP << 16) |
845 				   (TCPOPT_MD5SIG << 8) |
846 				   TCPOLEN_MD5SIG);
847 		/* Update length and the length the header thinks exists */
848 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
849 		rep.th.doff = arg.iov[0].iov_len / 4;
850 
851 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
852 				     key, ip_hdr(skb)->saddr,
853 				     ip_hdr(skb)->daddr, &rep.th);
854 	}
855 #endif
856 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
857 	if (rep.opt[0] == 0) {
858 		__be32 mrst = mptcp_reset_option(skb);
859 
860 		if (mrst) {
861 			rep.opt[0] = mrst;
862 			arg.iov[0].iov_len += sizeof(mrst);
863 			rep.th.doff = arg.iov[0].iov_len / 4;
864 		}
865 	}
866 
867 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
868 				      ip_hdr(skb)->saddr, /* XXX */
869 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
870 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
871 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
872 
873 	/* When socket is gone, all binding information is lost.
874 	 * routing might fail in this case. No choice here, if we choose to force
875 	 * input interface, we will misroute in case of asymmetric route.
876 	 */
877 	if (sk)
878 		arg.bound_dev_if = sk->sk_bound_dev_if;
879 
880 	trace_tcp_send_reset(sk, skb, reason);
881 
882 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
883 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
884 
885 	arg.tos = ip_hdr(skb)->tos;
886 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
887 	local_bh_disable();
888 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
889 	sock_net_set(ctl_sk, net);
890 	if (sk) {
891 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
892 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
893 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
894 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
895 		transmit_time = tcp_transmit_time(sk);
896 		xfrm_sk_clone_policy(ctl_sk, sk);
897 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
898 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
899 	} else {
900 		ctl_sk->sk_mark = 0;
901 		ctl_sk->sk_priority = 0;
902 	}
903 	ip_send_unicast_reply(ctl_sk,
904 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
905 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
906 			      &arg, arg.iov[0].iov_len,
907 			      transmit_time, txhash);
908 
909 	xfrm_sk_free_policy(ctl_sk);
910 	sock_net_set(ctl_sk, &init_net);
911 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
912 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
913 	local_bh_enable();
914 
915 #ifdef CONFIG_TCP_MD5SIG
916 out:
917 	rcu_read_unlock();
918 #endif
919 }
920 
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK in reply to @skb without a full socket context.
 *
 * @sk:          socket the segment matched; also inspected as a TIME-WAIT
 *               socket when sk->sk_state is TCP_TIME_WAIT
 * @skb:         incoming segment being acknowledged
 * @seq:         sequence number to put in the reply
 * @ack:         acknowledgment number to put in the reply
 * @win:         window to advertise
 * @tsval:       timestamp value to send; only used when @tsecr is non-zero
 * @tsecr:       timestamp echo; a non-zero value adds a timestamp option
 * @oif:         if non-zero, output interface the reply is bound to
 * @key:         MD5 or AO key used to sign the reply, if any
 * @reply_flags: flags stored in the reply descriptor
 * @tos:         TOS for the reply
 * @txhash:      hash passed to ip_send_unicast_reply()
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	/* Only the TCP header is cleared here; option words are written
	 * explicitly below as iov_len is extended to cover them.
	 */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Timestamp option occupies the first three option words. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* The signature option goes after the timestamp, if present. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		/* The AO option goes after the timestamp, if present. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	/* Borrow this CPU's control socket for the duration of the send. */
	ctl_sk = this_cpu_read(ipv4_tcp_sk);
	sock_net_set(ctl_sk, net);
	/* Mark and priority come from the TIME-WAIT fields when sk is a
	 * TIME-WAIT socket, otherwise from the socket itself.
	 */
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Hand the control socket back to the initial namespace. */
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}
1025 
/*
 * Reply with an ACK to @skb on behalf of the TIME_WAIT socket @sk.
 *
 * The signing key is chosen first: a TCP-AO key when the timewait socket
 * carries AO info and the segment's AO option selects an established key,
 * otherwise the TCP-MD5 key attached to the timewait socket (if any).
 * The ACK itself is built by tcp_v4_send_ack() from the sequence, window,
 * timestamp, device, tos and txhash values saved in the timewait socket.
 *
 * One timewait reference is dropped with inet_twsk_put() on every path,
 * including the early return taken when the auth options cannot be parsed.
 */
1026 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1027 {
1028 	struct inet_timewait_sock *tw = inet_twsk(sk);
1029 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	/* Zero-initialised: key.type stays unset when no key is selected. */
1030 	struct tcp_key key = {};
1031 #ifdef CONFIG_TCP_AO
1032 	struct tcp_ao_info *ao_info;
1033 
1034 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1035 		/* FIXME: the segment to-be-acked is not verified yet */
1036 		ao_info = rcu_dereference(tcptw->ao_info);
1037 		if (ao_info) {
1038 			const struct tcp_ao_hdr *aoh;
1039 
			/* Unparsable auth options: drop the reference, send nothing. */
1040 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1041 				inet_twsk_put(tw);
1042 				return;
1043 			}
1044 
			/* Look the key up by the rnext_keyid carried in the AO option. */
1045 			if (aoh)
1046 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1047 		}
1048 	}
	/* key.ao_key is only non-NULL if ao_info was loaded and non-NULL above. */
1049 	if (key.ao_key) {
1050 		struct tcp_ao_key *rnext_key;
1051 
1052 		key.traffic_key = snd_other_key(key.ao_key);
1053 		key.sne = READ_ONCE(ao_info->snd_sne);
1054 		rnext_key = READ_ONCE(ao_info->rnext_key);
1055 		key.rcv_next = rnext_key->rcvid;
1056 		key.type = TCP_KEY_AO;
1057 #else
	/* Without TCP-AO: never-taken branch so the "} else if" / "}" below pair up. */
1058 	if (0) {
1059 #endif
1060 #ifdef CONFIG_TCP_MD5SIG
1061 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1062 		key.md5_key = tcp_twsk_md5_key(tcptw);
1063 		if (key.md5_key)
1064 			key.type = TCP_KEY_MD5;
1065 #endif
1066 	}
1067 
	/* The advertised window is the saved window shifted right by the saved scale. */
1068 	tcp_v4_send_ack(sk, skb,
1069 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1070 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1071 			tcp_tw_tsval(tcptw),
1072 			tcptw->tw_ts_recent,
1073 			tw->tw_bound_dev_if, &key,
1074 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1075 			tw->tw_tos,
1076 			tw->tw_txhash);
1077 
1078 	inet_twsk_put(tw);
1079 }
1080 
/*
 * Send an ACK in reply to @skb for the pending connection request @req
 * that belongs to socket @sk.
 *
 * A signing key is selected first: TCP-AO when the request used AO
 * (tcp_rsk_used_ao()), otherwise a TCP-MD5 key looked up by the peer
 * address. For TCP-AO a traffic-key buffer is allocated with GFP_ATOMIC,
 * filled by tcp_v4_ao_calc_key_rsk() and freed again after the ACK is sent.
 *
 * No ACK is sent (silent return) when the AO option cannot be parsed or is
 * absent, when no AO key matches the peer, or when the traffic-key
 * allocation fails.
 */
1081 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1082 				  struct request_sock *req)
1083 {
1084 	struct tcp_key key = {};
1085 
1086 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1087 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1088 	 */
1089 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1090 					     tcp_sk(sk)->snd_nxt;
1091 
1092 #ifdef CONFIG_TCP_AO
1093 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1094 	    tcp_rsk_used_ao(req)) {
1095 		const union tcp_md5_addr *addr;
1096 		const struct tcp_ao_hdr *aoh;
1097 		int l3index;
1098 
1099 		/* Invalid TCP option size or twice included auth */
1100 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1101 			return;
		/* The request used AO but this segment carries no AO option. */
1102 		if (!aoh)
1103 			return;
1104 
		/* Key lookup is keyed on the peer (IP source) address. */
1105 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		/* l3index is inet_iif(skb) when tcp_v4_sdif(skb) is non-zero, else 0. */
1106 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1107 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1108 					      aoh->rnext_keyid, -1);
1109 		if (unlikely(!key.ao_key)) {
1110 			/* Send ACK with any matching MKT for the peer */
1111 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1112 			/* Matching key disappeared (user removed the key?)
1113 			 * let the handshake timeout.
1114 			 */
1115 			if (!key.ao_key) {
1116 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1117 						     addr,
1118 						     ntohs(tcp_hdr(skb)->source),
1119 						     &ip_hdr(skb)->daddr,
1120 						     ntohs(tcp_hdr(skb)->dest));
1121 				return;
1122 			}
1123 		}
		/* Scratch buffer for the derived traffic key; freed at the end of this function. */
1124 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1125 		if (!key.traffic_key)
1126 			return;
1127 
1128 		key.type = TCP_KEY_AO;
1129 		key.rcv_next = aoh->keyid;
1130 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1131 #else
	/* Without TCP-AO: never-taken branch so the "} else if" / "}" below pair up. */
1132 	if (0) {
1133 #endif
1134 #ifdef CONFIG_TCP_MD5SIG
1135 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1136 		const union tcp_md5_addr *addr;
1137 		int l3index;
1138 
1139 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1140 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1141 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1142 		if (key.md5_key)
1143 			key.type = TCP_KEY_MD5;
1144 #endif
1145 	}
1146 
1147 	/* RFC 7323 2.3
1148 	 * The window field (SEG.WND) of every outgoing segment, with the
1149 	 * exception of <SYN> segments, MUST be right-shifted by
1150 	 * Rcv.Wind.Shift bits:
1151 	 */
1152 	tcp_v4_send_ack(sk, skb, seq,
1153 			tcp_rsk(req)->rcv_nxt,
1154 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
1155 			tcp_rsk_tsval(tcp_rsk(req)),
1156 			READ_ONCE(req->ts_recent),
1157 			0, &key,
1158 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1159 			ip_hdr(skb)->tos,
1160 			READ_ONCE(tcp_rsk(req)->txhash));
	/* Only the AO path allocated key.traffic_key above. */
1161 	if (tcp_key_is_ao(&key))
1162 		kfree(key.traffic_key);
1163 }
1164 
1165 /*
1166  *	Send a SYN-ACK after having received a SYN.
1167  *	This still operates on a request_sock only, not on a big
1168  *	socket.
1169  */
1170 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1171 			      struct flowi *fl,
1172 			      struct request_sock *req,
1173 			      struct tcp_fastopen_cookie *foc,
1174 			      enum tcp_synack_type synack_type,
1175 			      struct sk_buff *syn_skb)
1176 {
1177 	const struct inet_request_sock *ireq = inet_rsk(req);
1178 	struct flowi4 fl4;
1179 	int err = -1;
1180 	struct sk_buff *skb;
1181 	u8 tos;
1182 
1183 	/* First, grab a route. */
1184 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1185 		return -1;
1186 
1187 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1188 
1189 	if (skb) {
1190 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1191 
1192 		tos = READ_ONCE(inet_sk(sk)->tos);
1193 
1194 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1195 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1196 			      (tos & INET_ECN_MASK);
1197 
1198 		if (!INET_ECN_is_capable(tos) &&
1199 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1200 			tos |= INET_ECN_ECT_0;
1201 
1202 		rcu_read_lock();
1203 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1204 					    ireq->ir_rmt_addr,
1205 					    rcu_dereference(ireq->ireq_opt),
1206 					    tos);
1207 		rcu_read_unlock();
1208 		err = net_xmit_eval(err);
1209 	}
1210 
1211 	return err;
1212 }
1213 
1214 /*
1215  *	IPv4 request_sock destructor.
1216  */
1217 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1218 {
1219 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1220 }
1221 
1222 #ifdef CONFIG_TCP_MD5SIG
1223 /*
1224  * RFC2385 MD5 checksumming requires a mapping of
1225  * IP address->MD5 Key.
1226  * We need to maintain these in the sk structure.
1227  */
1228 
/* Static key, initially false, tested with static_branch_unlikely() before
 * the TCP-MD5 key lookups in this file. It is incremented when a socket
 * gets its md5sig_info (see tcp_md5_do_add() and tcp_md5_key_copy()).
 * NOTE(review): HZ is presumably the rate limit used when the key is
 * decremented again - confirm against the jump-label documentation.
 */
1229 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1230 EXPORT_SYMBOL(tcp_md5_needed);
1231 
1232 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1233 {
1234 	if (!old)
1235 		return true;
1236 
1237 	/* l3index always overrides non-l3index */
1238 	if (old->l3index && new->l3index == 0)
1239 		return false;
1240 	if (old->l3index == 0 && new->l3index)
1241 		return true;
1242 
1243 	return old->prefixlen < new->prefixlen;
1244 }
1245 
1246 /* Find the Key structure for an address.  */
1247 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1248 					   const union tcp_md5_addr *addr,
1249 					   int family, bool any_l3index)
1250 {
1251 	const struct tcp_sock *tp = tcp_sk(sk);
1252 	struct tcp_md5sig_key *key;
1253 	const struct tcp_md5sig_info *md5sig;
1254 	__be32 mask;
1255 	struct tcp_md5sig_key *best_match = NULL;
1256 	bool match;
1257 
1258 	/* caller either holds rcu_read_lock() or socket lock */
1259 	md5sig = rcu_dereference_check(tp->md5sig_info,
1260 				       lockdep_sock_is_held(sk));
1261 	if (!md5sig)
1262 		return NULL;
1263 
1264 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1265 				 lockdep_sock_is_held(sk)) {
1266 		if (key->family != family)
1267 			continue;
1268 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1269 		    key->l3index != l3index)
1270 			continue;
1271 		if (family == AF_INET) {
1272 			mask = inet_make_mask(key->prefixlen);
1273 			match = (key->addr.a4.s_addr & mask) ==
1274 				(addr->a4.s_addr & mask);
1275 #if IS_ENABLED(CONFIG_IPV6)
1276 		} else if (family == AF_INET6) {
1277 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1278 						  key->prefixlen);
1279 #endif
1280 		} else {
1281 			match = false;
1282 		}
1283 
1284 		if (match && better_md5_match(best_match, key))
1285 			best_match = key;
1286 	}
1287 	return best_match;
1288 }
1289 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1290 
1291 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1292 						      const union tcp_md5_addr *addr,
1293 						      int family, u8 prefixlen,
1294 						      int l3index, u8 flags)
1295 {
1296 	const struct tcp_sock *tp = tcp_sk(sk);
1297 	struct tcp_md5sig_key *key;
1298 	unsigned int size = sizeof(struct in_addr);
1299 	const struct tcp_md5sig_info *md5sig;
1300 
1301 	/* caller either holds rcu_read_lock() or socket lock */
1302 	md5sig = rcu_dereference_check(tp->md5sig_info,
1303 				       lockdep_sock_is_held(sk));
1304 	if (!md5sig)
1305 		return NULL;
1306 #if IS_ENABLED(CONFIG_IPV6)
1307 	if (family == AF_INET6)
1308 		size = sizeof(struct in6_addr);
1309 #endif
1310 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1311 				 lockdep_sock_is_held(sk)) {
1312 		if (key->family != family)
1313 			continue;
1314 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1315 			continue;
1316 		if (key->l3index != l3index)
1317 			continue;
1318 		if (!memcmp(&key->addr, addr, size) &&
1319 		    key->prefixlen == prefixlen)
1320 			return key;
1321 	}
1322 	return NULL;
1323 }
1324 
1325 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1326 					 const struct sock *addr_sk)
1327 {
1328 	const union tcp_md5_addr *addr;
1329 	int l3index;
1330 
1331 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1332 						 addr_sk->sk_bound_dev_if);
1333 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1334 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1335 }
1336 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1337 
1338 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1339 {
1340 	struct tcp_sock *tp = tcp_sk(sk);
1341 	struct tcp_md5sig_info *md5sig;
1342 
1343 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1344 	if (!md5sig)
1345 		return -ENOMEM;
1346 
1347 	sk_gso_disable(sk);
1348 	INIT_HLIST_HEAD(&md5sig->head);
1349 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1350 	return 0;
1351 }
1352 
1353 /* This can be called on a newly created socket, from other files */
1354 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1355 			    int family, u8 prefixlen, int l3index, u8 flags,
1356 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1357 {
1358 	/* Add Key to the list */
1359 	struct tcp_md5sig_key *key;
1360 	struct tcp_sock *tp = tcp_sk(sk);
1361 	struct tcp_md5sig_info *md5sig;
1362 
1363 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1364 	if (key) {
1365 		/* Pre-existing entry - just update that one.
1366 		 * Note that the key might be used concurrently.
1367 		 * data_race() is telling kcsan that we do not care of
1368 		 * key mismatches, since changing MD5 key on live flows
1369 		 * can lead to packet drops.
1370 		 */
1371 		data_race(memcpy(key->key, newkey, newkeylen));
1372 
1373 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1374 		 * Also note that a reader could catch new key->keylen value
1375 		 * but old key->key[], this is the reason we use __GFP_ZERO
1376 		 * at sock_kmalloc() time below these lines.
1377 		 */
1378 		WRITE_ONCE(key->keylen, newkeylen);
1379 
1380 		return 0;
1381 	}
1382 
1383 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1384 					   lockdep_sock_is_held(sk));
1385 
1386 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1387 	if (!key)
1388 		return -ENOMEM;
1389 
1390 	memcpy(key->key, newkey, newkeylen);
1391 	key->keylen = newkeylen;
1392 	key->family = family;
1393 	key->prefixlen = prefixlen;
1394 	key->l3index = l3index;
1395 	key->flags = flags;
1396 	memcpy(&key->addr, addr,
1397 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1398 								 sizeof(struct in_addr));
1399 	hlist_add_head_rcu(&key->node, &md5sig->head);
1400 	return 0;
1401 }
1402 
1403 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1404 		   int family, u8 prefixlen, int l3index, u8 flags,
1405 		   const u8 *newkey, u8 newkeylen)
1406 {
1407 	struct tcp_sock *tp = tcp_sk(sk);
1408 
1409 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1410 		if (tcp_md5_alloc_sigpool())
1411 			return -ENOMEM;
1412 
1413 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1414 			tcp_md5_release_sigpool();
1415 			return -ENOMEM;
1416 		}
1417 
1418 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1419 			struct tcp_md5sig_info *md5sig;
1420 
1421 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1422 			rcu_assign_pointer(tp->md5sig_info, NULL);
1423 			kfree_rcu(md5sig, rcu);
1424 			tcp_md5_release_sigpool();
1425 			return -EUSERS;
1426 		}
1427 	}
1428 
1429 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1430 				newkey, newkeylen, GFP_KERNEL);
1431 }
1432 EXPORT_SYMBOL(tcp_md5_do_add);
1433 
1434 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1435 		     int family, u8 prefixlen, int l3index,
1436 		     struct tcp_md5sig_key *key)
1437 {
1438 	struct tcp_sock *tp = tcp_sk(sk);
1439 
1440 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1441 		tcp_md5_add_sigpool();
1442 
1443 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1444 			tcp_md5_release_sigpool();
1445 			return -ENOMEM;
1446 		}
1447 
1448 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1449 			struct tcp_md5sig_info *md5sig;
1450 
1451 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1452 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1453 			rcu_assign_pointer(tp->md5sig_info, NULL);
1454 			kfree_rcu(md5sig, rcu);
1455 			tcp_md5_release_sigpool();
1456 			return -EUSERS;
1457 		}
1458 	}
1459 
1460 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1461 				key->flags, key->key, key->keylen,
1462 				sk_gfp_mask(sk, GFP_ATOMIC));
1463 }
1464 EXPORT_SYMBOL(tcp_md5_key_copy);
1465 
1466 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1467 		   u8 prefixlen, int l3index, u8 flags)
1468 {
1469 	struct tcp_md5sig_key *key;
1470 
1471 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1472 	if (!key)
1473 		return -ENOENT;
1474 	hlist_del_rcu(&key->node);
1475 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1476 	kfree_rcu(key, rcu);
1477 	return 0;
1478 }
1479 EXPORT_SYMBOL(tcp_md5_do_del);
1480 
1481 void tcp_clear_md5_list(struct sock *sk)
1482 {
1483 	struct tcp_sock *tp = tcp_sk(sk);
1484 	struct tcp_md5sig_key *key;
1485 	struct hlist_node *n;
1486 	struct tcp_md5sig_info *md5sig;
1487 
1488 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1489 
1490 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1491 		hlist_del_rcu(&key->node);
1492 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1493 		kfree_rcu(key, rcu);
1494 	}
1495 }
1496 
1497 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1498 				 sockptr_t optval, int optlen)
1499 {
1500 	struct tcp_md5sig cmd;
1501 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1502 	const union tcp_md5_addr *addr;
1503 	u8 prefixlen = 32;
1504 	int l3index = 0;
1505 	bool l3flag;
1506 	u8 flags;
1507 
1508 	if (optlen < sizeof(cmd))
1509 		return -EINVAL;
1510 
1511 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1512 		return -EFAULT;
1513 
1514 	if (sin->sin_family != AF_INET)
1515 		return -EINVAL;
1516 
1517 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1518 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519 
1520 	if (optname == TCP_MD5SIG_EXT &&
1521 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1522 		prefixlen = cmd.tcpm_prefixlen;
1523 		if (prefixlen > 32)
1524 			return -EINVAL;
1525 	}
1526 
1527 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1528 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1529 		struct net_device *dev;
1530 
1531 		rcu_read_lock();
1532 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1533 		if (dev && netif_is_l3_master(dev))
1534 			l3index = dev->ifindex;
1535 
1536 		rcu_read_unlock();
1537 
1538 		/* ok to reference set/not set outside of rcu;
1539 		 * right now device MUST be an L3 master
1540 		 */
1541 		if (!dev || !l3index)
1542 			return -EINVAL;
1543 	}
1544 
1545 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1546 
1547 	if (!cmd.tcpm_keylen)
1548 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1549 
1550 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1551 		return -EINVAL;
1552 
1553 	/* Don't allow keys for peers that have a matching TCP-AO key.
1554 	 * See the comment in tcp_ao_add_cmd()
1555 	 */
1556 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1557 		return -EKEYREJECTED;
1558 
1559 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1560 			      cmd.tcpm_key, cmd.tcpm_keylen);
1561 }
1562 
1563 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1564 				   __be32 daddr, __be32 saddr,
1565 				   const struct tcphdr *th, int nbytes)
1566 {
1567 	struct tcp4_pseudohdr *bp;
1568 	struct scatterlist sg;
1569 	struct tcphdr *_th;
1570 
1571 	bp = hp->scratch;
1572 	bp->saddr = saddr;
1573 	bp->daddr = daddr;
1574 	bp->pad = 0;
1575 	bp->protocol = IPPROTO_TCP;
1576 	bp->len = cpu_to_be16(nbytes);
1577 
1578 	_th = (struct tcphdr *)(bp + 1);
1579 	memcpy(_th, th, sizeof(*th));
1580 	_th->check = 0;
1581 
1582 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1583 	ahash_request_set_crypt(hp->req, &sg, NULL,
1584 				sizeof(*bp) + sizeof(*th));
1585 	return crypto_ahash_update(hp->req);
1586 }
1587 
1588 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1589 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1590 {
1591 	struct tcp_sigpool hp;
1592 
1593 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1594 		goto clear_hash_nostart;
1595 
1596 	if (crypto_ahash_init(hp.req))
1597 		goto clear_hash;
1598 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1599 		goto clear_hash;
1600 	if (tcp_md5_hash_key(&hp, key))
1601 		goto clear_hash;
1602 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1603 	if (crypto_ahash_final(hp.req))
1604 		goto clear_hash;
1605 
1606 	tcp_sigpool_end(&hp);
1607 	return 0;
1608 
1609 clear_hash:
1610 	tcp_sigpool_end(&hp);
1611 clear_hash_nostart:
1612 	memset(md5_hash, 0, 16);
1613 	return 1;
1614 }
1615 
1616 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1617 			const struct sock *sk,
1618 			const struct sk_buff *skb)
1619 {
1620 	const struct tcphdr *th = tcp_hdr(skb);
1621 	struct tcp_sigpool hp;
1622 	__be32 saddr, daddr;
1623 
1624 	if (sk) { /* valid for establish/request sockets */
1625 		saddr = sk->sk_rcv_saddr;
1626 		daddr = sk->sk_daddr;
1627 	} else {
1628 		const struct iphdr *iph = ip_hdr(skb);
1629 		saddr = iph->saddr;
1630 		daddr = iph->daddr;
1631 	}
1632 
1633 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1634 		goto clear_hash_nostart;
1635 
1636 	if (crypto_ahash_init(hp.req))
1637 		goto clear_hash;
1638 
1639 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1640 		goto clear_hash;
1641 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1642 		goto clear_hash;
1643 	if (tcp_md5_hash_key(&hp, key))
1644 		goto clear_hash;
1645 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1646 	if (crypto_ahash_final(hp.req))
1647 		goto clear_hash;
1648 
1649 	tcp_sigpool_end(&hp);
1650 	return 0;
1651 
1652 clear_hash:
1653 	tcp_sigpool_end(&hp);
1654 clear_hash_nostart:
1655 	memset(md5_hash, 0, 16);
1656 	return 1;
1657 }
1658 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1659 
1660 #endif
1661 
1662 static void tcp_v4_init_req(struct request_sock *req,
1663 			    const struct sock *sk_listener,
1664 			    struct sk_buff *skb)
1665 {
1666 	struct inet_request_sock *ireq = inet_rsk(req);
1667 	struct net *net = sock_net(sk_listener);
1668 
1669 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1670 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1671 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1672 }
1673 
1674 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1675 					  struct sk_buff *skb,
1676 					  struct flowi *fl,
1677 					  struct request_sock *req,
1678 					  u32 tw_isn)
1679 {
1680 	tcp_v4_init_req(req, sk, skb);
1681 
1682 	if (security_inet_conn_request(sk, skb, req))
1683 		return NULL;
1684 
1685 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1686 }
1687 
/* Address-family level request_sock operations for TCP over IPv4.
 * send_ack and destructor are the IPv4 helpers defined in this file; it is
 * passed to tcp_conn_request() and tcp_get_syncookie_mss() together with
 * tcp_request_sock_ipv4_ops.
 */
1688 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1689 	.family		=	PF_INET,
	/* Each request sock is allocated as a struct tcp_request_sock. */
1690 	.obj_size	=	sizeof(struct tcp_request_sock),
1691 	.rtx_syn_ack	=	tcp_rtx_synack,
1692 	.send_ack	=	tcp_v4_reqsk_send_ack,
1693 	.destructor	=	tcp_v4_reqsk_destructor,
1694 	.send_reset	=	tcp_v4_send_reset,
1695 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1696 };
1697 
/* TCP-specific request_sock operations for IPv4.
 * The MD5, TCP-AO and syncookie callbacks are only present when the
 * corresponding config option is enabled.
 */
1698 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1699 	.mss_clamp	=	TCP_MSS_DEFAULT,
1700 #ifdef CONFIG_TCP_MD5SIG
	/* TCP-MD5: key lookup and per-skb signature computation. */
1701 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1702 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1703 #endif
1704 #ifdef CONFIG_TCP_AO
	/* TCP-AO: key lookup, traffic-key derivation and SYN-ACK signing. */
1705 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1706 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1707 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1708 #endif
1709 #ifdef CONFIG_SYN_COOKIES
1710 	.cookie_init_seq =	cookie_v4_init_sequence,
1711 #endif
	/* Routing, initial sequence/timestamp offset and SYN-ACK transmission. */
1712 	.route_req	=	tcp_v4_route_req,
1713 	.init_seq	=	tcp_v4_init_seq,
1714 	.init_ts_off	=	tcp_v4_init_ts_off,
1715 	.send_synack	=	tcp_v4_send_synack,
1716 };
1717 
1718 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1719 {
1720 	/* Never answer to SYNs send to broadcast or multicast */
1721 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1722 		goto drop;
1723 
1724 	return tcp_conn_request(&tcp_request_sock_ops,
1725 				&tcp_request_sock_ipv4_ops, sk, skb);
1726 
1727 drop:
1728 	tcp_listendrop(sk);
1729 	return 0;
1730 }
1731 EXPORT_SYMBOL(tcp_v4_conn_request);
1732 
1733 
1734 /*
1735  * The three way handshake has completed - we got a valid synack -
1736  * now create the new socket.
1737  */
/*
 * @sk:         the listening socket
 * @skb:        the segment being processed
 * @req:        the request sock the child is created from
 * @dst:        route for the child, or NULL to look one up here
 * @req_unhash: request sock passed to inet_ehash_nolisten(); the code below
 *              treats NULL as the syncookie case
 * @own_req:    set to the result of inet_ehash_nolisten()
 *
 * Returns the new child socket, or NULL. NULL is returned when the accept
 * queue is full, when the child, its route, its MD5/AO keys or its port
 * cannot be set up (each counted with tcp_listendrop()), or when a
 * duplicate socket was found in the syncookie case.
 */
1738 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1739 				  struct request_sock *req,
1740 				  struct dst_entry *dst,
1741 				  struct request_sock *req_unhash,
1742 				  bool *own_req)
1743 {
1744 	struct inet_request_sock *ireq;
1745 	bool found_dup_sk = false;
1746 	struct inet_sock *newinet;
1747 	struct tcp_sock *newtp;
1748 	struct sock *newsk;
1749 #ifdef CONFIG_TCP_MD5SIG
1750 	const union tcp_md5_addr *addr;
1751 	struct tcp_md5sig_key *key;
1752 	int l3index;
1753 #endif
1754 	struct ip_options_rcu *inet_opt;
1755 
1756 	if (sk_acceptq_is_full(sk))
1757 		goto exit_overflow;
1758 
1759 	newsk = tcp_create_openreq_child(sk, req, skb);
1760 	if (!newsk)
1761 		goto exit_nonewsk;
1762 
1763 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1764 	inet_sk_rx_dst_set(newsk, skb);
1765 
	/* Copy addressing and IP-level state from the request and the skb. */
1766 	newtp		      = tcp_sk(newsk);
1767 	newinet		      = inet_sk(newsk);
1768 	ireq		      = inet_rsk(req);
1769 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1770 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1771 	newsk->sk_bound_dev_if = ireq->ir_iif;
1772 	newinet->inet_saddr   = ireq->ir_loc_addr;
	/* The child shares the request's IP options pointer for now; further
	 * down exactly one of the two pointers is cleared again.
	 */
1773 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1774 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1775 	newinet->mc_index     = inet_iif(skb);
1776 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1777 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1778 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1779 	if (inet_opt)
1780 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1781 	atomic_set(&newinet->inet_id, get_random_u16());
1782 
1783 	/* Set ToS of the new socket based upon the value of incoming SYN.
1784 	 * ECT bits are set later in tcp_init_transfer().
1785 	 */
1786 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1787 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1788 
1789 	if (!dst) {
1790 		dst = inet_csk_route_child_sock(sk, newsk, req);
1791 		if (!dst)
1792 			goto put_and_exit;
1793 	} else {
1794 		/* syncookie case : see end of cookie_v4_check() */
1795 	}
1796 	sk_setup_caps(newsk, dst);
1797 
1798 	tcp_ca_openreq_child(newsk, dst);
1799 
1800 	tcp_sync_mss(newsk, dst_mtu(dst));
1801 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1802 
1803 	tcp_initialize_rcv_mss(newsk);
1804 
1805 #ifdef CONFIG_TCP_MD5SIG
1806 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1807 	/* Copy over the MD5 key from the original socket */
1808 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1809 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* The key is copied as a /32 entry, and only if the request did not use TCP-AO. */
1810 	if (key && !tcp_rsk_used_ao(req)) {
1811 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1812 			goto put_and_exit;
1813 		sk_gso_disable(newsk);
1814 	}
1815 #endif
1816 #ifdef CONFIG_TCP_AO
1817 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1818 		goto put_and_exit; /* OOM, release back memory */
1819 #endif
1820 
1821 	if (__inet_inherit_port(sk, newsk) < 0)
1822 		goto put_and_exit;
1823 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1824 				       &found_dup_sk);
1825 	if (likely(*own_req)) {
1826 		tcp_move_syn(newtp, req);
		/* The child keeps the options; clear the request's pointer. */
1827 		ireq->ireq_opt = NULL;
1828 	} else {
		/* The request keeps the options; clear the child's pointer. */
1829 		newinet->inet_opt = NULL;
1830 
1831 		if (!req_unhash && found_dup_sk) {
1832 			/* This code path should be executed in the
1833 			 * syncookie case only
1834 			 */
1835 			bh_unlock_sock(newsk);
1836 			sock_put(newsk);
1837 			newsk = NULL;
1838 		}
1839 	}
1840 	return newsk;
1841 
	/* Failure exits; each label falls through into the next one. */
1842 exit_overflow:
1843 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1844 exit_nonewsk:
1845 	dst_release(dst);
1846 exit:
1847 	tcp_listendrop(sk);
1848 	return NULL;
	/* A child already exists: detach the shared options and close it. This
	 * path skips dst_release().
	 * NOTE(review): presumably the route is owned by the child once
	 * sk_setup_caps() has run - confirm.
	 */
1849 put_and_exit:
1850 	newinet->inet_opt = NULL;
1851 	inet_csk_prepare_forced_close(newsk);
1852 	tcp_done(newsk);
1853 	goto exit;
1854 }
1855 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1856 
/*
 * With CONFIG_SYN_COOKIES, a segment without the SYN flag is passed to
 * cookie_v4_check() and its result is returned. In every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (tcp_hdr(skb)->syn)
		return sk;

	return cookie_v4_check(sk, skb);
#else
	return sk;
#endif
}
1867 
1868 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1869 			 struct tcphdr *th, u32 *cookie)
1870 {
1871 	u16 mss = 0;
1872 #ifdef CONFIG_SYN_COOKIES
1873 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1874 				    &tcp_request_sock_ipv4_ops, sk, th);
1875 	if (mss) {
1876 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1877 		tcp_synq_overflow(sk);
1878 	}
1879 #endif
1880 	return mss;
1881 }
1882 
1883 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1884 							   u32));
1885 /* The socket must have its spinlock held when we get
1886  * here, unless it is a TCP_LISTEN socket.
1887  *
1888  * We have a potential double-lock case here, so even when
1889  * doing backlog processing we use the BH locking scheme.
1890  * This is because we cannot sleep with the original spinlock
1891  * held.
1892  */
/*
 * Process one received segment @skb for socket @sk. Always returns 0.
 *
 * ESTABLISHED sockets take the fast path into tcp_rcv_established().
 * All other states first verify the checksum. A listener then runs the
 * syncookie check, which may hand the segment to a child socket; everything
 * else goes through tcp_rcv_state_process(). If processing yields a
 * non-zero drop reason, a reset is sent and the skb is freed with that
 * reason.
 */
1893 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1894 {
1895 	enum skb_drop_reason reason;
1896 	struct sock *rsk;
1897 
1898 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1899 		struct dst_entry *dst;
1900 
1901 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1902 						lockdep_sock_is_held(sk));
1903 
1904 		sock_rps_save_rxhash(sk, skb);
1905 		sk_mark_napi_id(sk, skb);
		/* Drop the cached rx route if the skb arrived on a different
		 * interface or the route's check callback rejects it.
		 */
1906 		if (dst) {
1907 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1908 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1909 					     dst, 0)) {
1910 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1911 				dst_release(dst);
1912 			}
1913 		}
1914 		tcp_rcv_established(sk, skb);
1915 		return 0;
1916 	}
1917 
1918 	if (tcp_checksum_complete(skb))
1919 		goto csum_err;
1920 
1921 	if (sk->sk_state == TCP_LISTEN) {
1922 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1923 
		/* NOTE(review): returns without freeing skb here, so the cookie
		 * check presumably disposes of it on failure - confirm.
		 */
1924 		if (!nsk)
1925 			return 0;
		/* A different socket came back: let the child process the segment. */
1926 		if (nsk != sk) {
1927 			reason = tcp_child_process(sk, nsk, skb);
1928 			if (reason) {
1929 				rsk = nsk;
1930 				goto reset;
1931 			}
1932 			return 0;
1933 		}
1934 	} else
1935 		sock_rps_save_rxhash(sk, skb);
1936 
1937 	reason = tcp_rcv_state_process(sk, skb);
1938 	if (reason) {
1939 		rsk = sk;
1940 		goto reset;
1941 	}
1942 	return 0;
1943 
	/* rsk is the socket the reset is sent for; falls through to discard. */
1944 reset:
1945 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1946 discard:
1947 	kfree_skb_reason(skb, reason);
1948 	/* Be careful here. If this function gets more complicated and
1949 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1950 	 * might be destroyed here. This current version compiles correctly,
1951 	 * but you have been warned.
1952 	 */
1953 	return 0;
1954 
	/* Bad checksum: count it, then free the skb via discard (no reset). */
1955 csum_err:
1956 	reason = SKB_DROP_REASON_TCP_CSUM;
1957 	trace_tcp_bad_csum(skb);
1958 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1959 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1960 	goto discard;
1961 }
1962 EXPORT_SYMBOL(tcp_v4_do_rcv);
1963 
1964 int tcp_v4_early_demux(struct sk_buff *skb)
1965 {
1966 	struct net *net = dev_net(skb->dev);
1967 	const struct iphdr *iph;
1968 	const struct tcphdr *th;
1969 	struct sock *sk;
1970 
1971 	if (skb->pkt_type != PACKET_HOST)
1972 		return 0;
1973 
1974 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1975 		return 0;
1976 
1977 	iph = ip_hdr(skb);
1978 	th = tcp_hdr(skb);
1979 
1980 	if (th->doff < sizeof(struct tcphdr) / 4)
1981 		return 0;
1982 
1983 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1984 				       iph->saddr, th->source,
1985 				       iph->daddr, ntohs(th->dest),
1986 				       skb->skb_iif, inet_sdif(skb));
1987 	if (sk) {
1988 		skb->sk = sk;
1989 		skb->destructor = sock_edemux;
1990 		if (sk_fullsock(sk)) {
1991 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1992 
1993 			if (dst)
1994 				dst = dst_check(dst, 0);
1995 			if (dst &&
1996 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1997 				skb_dst_set_noref(skb, dst);
1998 		}
1999 	}
2000 	return 0;
2001 }
2002 
/* Put @skb on the backlog of @sk, first trying to merge it into the skb
 * currently at the tail of the backlog.
 *
 * Returns false when @skb was coalesced into the tail or queued on the
 * backlog.  Returns true when the caller must drop @skb (checksum failure
 * or backlog limit exceeded); in that case *@reason holds the drop reason
 * and bh_unlock_sock(sk) has already been called here.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	skb_dst_drop(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only when skb starts exactly where tail ends, the DS
	 * field matches, neither segment carries SYN/RST/URG, both carry
	 * ACK, ECE/CWR are identical, the MPTCP and decrypted-state checks
	 * pass, and both headers have the same length and option bytes.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
	    !mptcp_skb_can_collapse(tail, skb) ||
	    skb_cmp_decrypted(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
		goto no_coalesce;

	/* Strip the TCP header so that only payload is appended to tail. */
	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	/* tail still has its TCP header in front of the payload, hence the
	 * hdrlen subtraction.  shinfo points at tail's shared info from here on.
	 */
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Take ack_seq and window from skb unless its ack is older. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		/* Account the truesize growth reported by skb_try_coalesce(). */
		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	/* Coalescing failed: put back the header pulled above. */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	/* The limit is computed in 64 bits; clamp it to UINT_MAX before use. */
	limit = min_t(u64, limit, UINT_MAX);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
2136 
2137 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2138 {
2139 	struct tcphdr *th = (struct tcphdr *)skb->data;
2140 
2141 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2142 }
2143 EXPORT_SYMBOL(tcp_filter);
2144 
2145 static void tcp_v4_restore_cb(struct sk_buff *skb)
2146 {
2147 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2148 		sizeof(struct inet_skb_parm));
2149 }
2150 
/* Populate TCP_SKB_CB(skb) from the IPv4 header @iph and TCP header @th.
 *
 * The inet_skb_parm found at IPCB(skb) is first moved into
 * TCP_SKB_CB(skb)->header.h4 (tcp_v4_restore_cb() moves it back), then the
 * sequence numbers, flags, DS field and rx-timestamp indicator are filled in.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky: we move IPCB to its correct location inside
	 * TCP_SKB_CB(); barrier() makes sure the compiler won't play
	 * aliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each take one unit of sequence space on top of the payload. */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked	 = 0;
	/* True when either a software or a hardware timestamp is present. */
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2171 
2172 /*
2173  *	From tcp_input.c
2174  */
2175 
/* IPv4 TCP receive routine.
 *
 * Validates the TCP header of @skb, initialises checksum state, looks up
 * the matching socket and then, depending on the socket state:
 *  - TCP_TIME_WAIT:    handled under the do_time_wait label;
 *  - TCP_NEW_SYN_RECV: validated against the listener and passed to
 *                      tcp_check_req() / tcp_child_process();
 *  - TCP_LISTEN:       passed straight to tcp_v4_do_rcv();
 *  - anything else:    processed by tcp_v4_do_rcv() under bh_lock_sock, or
 *                      queued with tcp_add_backlog() when the socket is
 *                      owned by user context.
 *
 * Segments that are not delivered are freed with kfree_skb_reason().
 * Returns the result of tcp_v4_do_rcv() on the direct-processing paths and
 * 0 on all other paths.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	enum skb_drop_reason drop_reason;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;	/* true when a reference on sk must be dropped with sock_put() */
	struct sock *sk;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	/* Now make the full header, options included, available. */
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pskb_may_pull() calls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
			       skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* The lookup returned a request socket; from here on sk is
		 * its listener (or a migrated listener, see below).
		 */
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_add(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb)) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
		} else {
			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* tcp_check_req() handed back the listener itself:
			 * restore the cb and continue at "process" below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				goto discard_and_relse;
			}
			sock_put(sk);
			return 0;
		}
	}

process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto discard_and_relse;
	}
	/* Header pointers are re-read after tcp_filter() before filling the cb. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking bh_lock_sock here. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* On failure tcp_add_backlog() has already called
		 * bh_unlock_sock(), so the unlock below is skipped.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* csum_error and bad_packet are also jumped to from above; a segment
	 * with a valid checksum and no socket is answered with a reset.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	kfree_skb_reason(skb, drop_reason);
	return 0;

discard_and_relse:
	/* Account the drop on the socket, release our reference if we hold
	 * one, then free the skb.
	 */
	sk_drops_add(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
	case TCP_TW_SYN: {
		/* Look for a listener that could take this SYN; if found,
		 * the timewait socket is descheduled and the segment is
		 * processed by that listener.
		 */
		struct sock *sk2 = inet_lookup_listener(net,
							net->ipv4.tcp_death_row.hashinfo,
							skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2437 
/* TIME_WAIT socket operations for TCP: the size of the timewait object
 * and the destructor to run on it.
 */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_destructor= tcp_twsk_destructor,
};
2442 
2443 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2444 {
2445 	struct dst_entry *dst = skb_dst(skb);
2446 
2447 	if (dst && dst_hold_safe(dst)) {
2448 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2449 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2450 	}
2451 }
2452 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2453 
/* IPv4 implementations of the address-family specific connection socket
 * operations.  tcp_v4_init_sock() installs this table as icsk_af_ops.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);
2469 
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 callbacks for TCP-MD5 and TCP-AO; each group is only present when
 * the corresponding config option is enabled.  tcp_v4_init_sock() installs
 * this table as tcp_sk(sk)->af_specific.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup		= tcp_v4_ao_lookup,
	.calc_ao_hash		= tcp_v4_ao_hash_skb,
	.ao_parse		= tcp_v4_parse_ao,
	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
#endif
};
#endif
2485 
2486 /* NOTE: A lot of things set to zero explicitly by call to
2487  *       sk_alloc() so need not be done here.
2488  */
2489 static int tcp_v4_init_sock(struct sock *sk)
2490 {
2491 	struct inet_connection_sock *icsk = inet_csk(sk);
2492 
2493 	tcp_init_sock(sk);
2494 
2495 	icsk->icsk_af_ops = &ipv4_specific;
2496 
2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2499 #endif
2500 
2501 	return 0;
2502 }
2503 
#ifdef CONFIG_TCP_MD5SIG
/* RCU callback: free the tcp_md5sig_info that embeds @head, then drop the
 * tcp_md5_needed static-branch count and release the MD5 sigpool.
 */
static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
{
	struct tcp_md5sig_info *info =
		container_of(head, struct tcp_md5sig_info, rcu);

	kfree(info);
	static_branch_slow_dec_deferred(&tcp_md5_needed);
	tcp_md5_release_sigpool();
}
#endif
2515 
/* Release the TCP-level resources attached to @sk: timers, congestion
 * control and ULP state, queued skbs, MD5/AO keys, the bind bucket,
 * fastopen state and the saved SYN.  Finally decrements the allocated
 * sockets counter.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		struct tcp_md5sig_info *md5sig;

		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
		tcp_clear_md5_list(sk);
		/* The info structure itself is freed from an RCU callback. */
		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
		rcu_assign_pointer(tp->md5sig_info, NULL);
	}
#endif
	tcp_ao_destroy_sock(sk, false);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A fastopen request socket must not still be attached here. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
2564 
2565 #ifdef CONFIG_PROC_FS
2566 /* Proc filesystem TCP sock list dumping. */
2567 
2568 static unsigned short seq_file_family(const struct seq_file *seq);
2569 
2570 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2571 {
2572 	unsigned short family = seq_file_family(seq);
2573 
2574 	/* AF_UNSPEC is used as a match all */
2575 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2576 		net_eq(sock_net(sk), seq_file_net(seq)));
2577 }
2578 
2579 /* Find a non empty bucket (starting from st->bucket)
2580  * and return the first sk from it.
2581  */
2582 static void *listening_get_first(struct seq_file *seq)
2583 {
2584 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2585 	struct tcp_iter_state *st = seq->private;
2586 
2587 	st->offset = 0;
2588 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2589 		struct inet_listen_hashbucket *ilb2;
2590 		struct hlist_nulls_node *node;
2591 		struct sock *sk;
2592 
2593 		ilb2 = &hinfo->lhash2[st->bucket];
2594 		if (hlist_nulls_empty(&ilb2->nulls_head))
2595 			continue;
2596 
2597 		spin_lock(&ilb2->lock);
2598 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2599 			if (seq_sk_match(seq, sk))
2600 				return sk;
2601 		}
2602 		spin_unlock(&ilb2->lock);
2603 	}
2604 
2605 	return NULL;
2606 }
2607 
2608 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2609  * If "cur" is the last one in the st->bucket,
2610  * call listening_get_first() to return the first sk of the next
2611  * non empty bucket.
2612  */
2613 static void *listening_get_next(struct seq_file *seq, void *cur)
2614 {
2615 	struct tcp_iter_state *st = seq->private;
2616 	struct inet_listen_hashbucket *ilb2;
2617 	struct hlist_nulls_node *node;
2618 	struct inet_hashinfo *hinfo;
2619 	struct sock *sk = cur;
2620 
2621 	++st->num;
2622 	++st->offset;
2623 
2624 	sk = sk_nulls_next(sk);
2625 	sk_nulls_for_each_from(sk, node) {
2626 		if (seq_sk_match(seq, sk))
2627 			return sk;
2628 	}
2629 
2630 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2631 	ilb2 = &hinfo->lhash2[st->bucket];
2632 	spin_unlock(&ilb2->lock);
2633 	++st->bucket;
2634 	return listening_get_first(seq);
2635 }
2636 
2637 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2638 {
2639 	struct tcp_iter_state *st = seq->private;
2640 	void *rc;
2641 
2642 	st->bucket = 0;
2643 	st->offset = 0;
2644 	rc = listening_get_first(seq);
2645 
2646 	while (rc && *pos) {
2647 		rc = listening_get_next(seq, rc);
2648 		--*pos;
2649 	}
2650 	return rc;
2651 }
2652 
2653 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2654 				const struct tcp_iter_state *st)
2655 {
2656 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2657 }
2658 
2659 /*
2660  * Get first established socket starting from bucket given in st->bucket.
2661  * If st->bucket is zero, the very first socket in the hash is returned.
2662  */
2663 static void *established_get_first(struct seq_file *seq)
2664 {
2665 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2666 	struct tcp_iter_state *st = seq->private;
2667 
2668 	st->offset = 0;
2669 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2670 		struct sock *sk;
2671 		struct hlist_nulls_node *node;
2672 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2673 
2674 		cond_resched();
2675 
2676 		/* Lockless fast path for the common case of empty buckets */
2677 		if (empty_bucket(hinfo, st))
2678 			continue;
2679 
2680 		spin_lock_bh(lock);
2681 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2682 			if (seq_sk_match(seq, sk))
2683 				return sk;
2684 		}
2685 		spin_unlock_bh(lock);
2686 	}
2687 
2688 	return NULL;
2689 }
2690 
2691 static void *established_get_next(struct seq_file *seq, void *cur)
2692 {
2693 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2694 	struct tcp_iter_state *st = seq->private;
2695 	struct hlist_nulls_node *node;
2696 	struct sock *sk = cur;
2697 
2698 	++st->num;
2699 	++st->offset;
2700 
2701 	sk = sk_nulls_next(sk);
2702 
2703 	sk_nulls_for_each_from(sk, node) {
2704 		if (seq_sk_match(seq, sk))
2705 			return sk;
2706 	}
2707 
2708 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2709 	++st->bucket;
2710 	return established_get_first(seq);
2711 }
2712 
2713 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2714 {
2715 	struct tcp_iter_state *st = seq->private;
2716 	void *rc;
2717 
2718 	st->bucket = 0;
2719 	rc = established_get_first(seq);
2720 
2721 	while (rc && pos) {
2722 		rc = established_get_next(seq, rc);
2723 		--pos;
2724 	}
2725 	return rc;
2726 }
2727 
2728 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2729 {
2730 	void *rc;
2731 	struct tcp_iter_state *st = seq->private;
2732 
2733 	st->state = TCP_SEQ_STATE_LISTENING;
2734 	rc	  = listening_get_idx(seq, &pos);
2735 
2736 	if (!rc) {
2737 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2738 		rc	  = established_get_idx(seq, pos);
2739 	}
2740 
2741 	return rc;
2742 }
2743 
/* Try to resume from the position recorded in the iterator state
 * (st->state, st->bucket, st->offset) instead of rescanning from the start.
 *
 * Starting at the saved bucket, up to the saved offset entries are skipped
 * while the walk stays inside that bucket.  Returns the entry reached, or
 * NULL when nothing could be found this way.  st->num is restored on exit
 * because the *_get_next() helpers increment it while skipping.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Stop skipping as soon as the walk leaves the saved bucket. */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening hash exhausted: continue with the established hash. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2777 
2778 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2779 {
2780 	struct tcp_iter_state *st = seq->private;
2781 	void *rc;
2782 
2783 	if (*pos && *pos == st->last_pos) {
2784 		rc = tcp_seek_last_pos(seq);
2785 		if (rc)
2786 			goto out;
2787 	}
2788 
2789 	st->state = TCP_SEQ_STATE_LISTENING;
2790 	st->num = 0;
2791 	st->bucket = 0;
2792 	st->offset = 0;
2793 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2794 
2795 out:
2796 	st->last_pos = *pos;
2797 	return rc;
2798 }
2799 EXPORT_SYMBOL(tcp_seq_start);
2800 
2801 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2802 {
2803 	struct tcp_iter_state *st = seq->private;
2804 	void *rc = NULL;
2805 
2806 	if (v == SEQ_START_TOKEN) {
2807 		rc = tcp_get_idx(seq, 0);
2808 		goto out;
2809 	}
2810 
2811 	switch (st->state) {
2812 	case TCP_SEQ_STATE_LISTENING:
2813 		rc = listening_get_next(seq, v);
2814 		if (!rc) {
2815 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2816 			st->bucket = 0;
2817 			st->offset = 0;
2818 			rc	  = established_get_first(seq);
2819 		}
2820 		break;
2821 	case TCP_SEQ_STATE_ESTABLISHED:
2822 		rc = established_get_next(seq, v);
2823 		break;
2824 	}
2825 out:
2826 	++*pos;
2827 	st->last_pos = *pos;
2828 	return rc;
2829 }
2830 EXPORT_SYMBOL(tcp_seq_next);
2831 
2832 void tcp_seq_stop(struct seq_file *seq, void *v)
2833 {
2834 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2835 	struct tcp_iter_state *st = seq->private;
2836 
2837 	switch (st->state) {
2838 	case TCP_SEQ_STATE_LISTENING:
2839 		if (v != SEQ_START_TOKEN)
2840 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2841 		break;
2842 	case TCP_SEQ_STATE_ESTABLISHED:
2843 		if (v)
2844 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2845 		break;
2846 	}
2847 }
2848 EXPORT_SYMBOL(tcp_seq_stop);
2849 
2850 static void get_openreq4(const struct request_sock *req,
2851 			 struct seq_file *f, int i)
2852 {
2853 	const struct inet_request_sock *ireq = inet_rsk(req);
2854 	long delta = req->rsk_timer.expires - jiffies;
2855 
2856 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2857 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2858 		i,
2859 		ireq->ir_loc_addr,
2860 		ireq->ir_num,
2861 		ireq->ir_rmt_addr,
2862 		ntohs(ireq->ir_rmt_port),
2863 		TCP_SYN_RECV,
2864 		0, 0, /* could print option size, but that is af dependent. */
2865 		1,    /* timers active (only the expire timer) */
2866 		jiffies_delta_to_clock_t(delta),
2867 		req->num_timeout,
2868 		from_kuid_munged(seq_user_ns(f),
2869 				 sock_i_uid(req->rsk_listener)),
2870 		0,  /* non standard timer */
2871 		0, /* open_requests have no inode */
2872 		0,
2873 		req);
2874 }
2875 
/* Print the dump line for the full socket @sk as entry number @i.
 *
 * The socket is not locked, so the values are a best-effort snapshot.
 * The "tr" column (timer_active) is encoded as:
 *   1 - icsk_pending is ICSK_TIME_RETRANS, ICSK_TIME_REO_TIMEOUT or
 *       ICSK_TIME_LOSS_PROBE
 *   4 - icsk_pending is ICSK_TIME_PROBE0
 *   2 - sk->sk_timer is pending
 *   0 - none of the above
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;
	int state;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		/* No timer: use "now" so the printed delta is zero. */
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	/* For listeners the rx_queue column shows the accept backlog. */
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	/* The last column is fastopenq->max_qlen for listeners; otherwise
	 * -1 while in initial slow start, else snd_ssthresh.
	 */
	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2937 
2938 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2939 			       struct seq_file *f, int i)
2940 {
2941 	long delta = tw->tw_timer.expires - jiffies;
2942 	__be32 dest, src;
2943 	__u16 destp, srcp;
2944 
2945 	dest  = tw->tw_daddr;
2946 	src   = tw->tw_rcv_saddr;
2947 	destp = ntohs(tw->tw_dport);
2948 	srcp  = ntohs(tw->tw_sport);
2949 
2950 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2951 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2952 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2953 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2954 		refcount_read(&tw->tw_refcnt), tw);
2955 }
2956 
2957 #define TMPSZ 150
2958 
2959 static int tcp4_seq_show(struct seq_file *seq, void *v)
2960 {
2961 	struct tcp_iter_state *st;
2962 	struct sock *sk = v;
2963 
2964 	seq_setwidth(seq, TMPSZ - 1);
2965 	if (v == SEQ_START_TOKEN) {
2966 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2967 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2968 			   "inode");
2969 		goto out;
2970 	}
2971 	st = seq->private;
2972 
2973 	if (sk->sk_state == TCP_TIME_WAIT)
2974 		get_timewait4_sock(v, seq, st->num);
2975 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2976 		get_openreq4(v, seq, st->num);
2977 	else
2978 		get_tcp4_sock(v, seq, st->num);
2979 out:
2980 	seq_pad(seq, '\n');
2981 	return 0;
2982 }
2983 
2984 #ifdef CONFIG_BPF_SYSCALL
/* Per-seq_file state of the BPF TCP iterator: the regular TCP iterator
 * state plus a batch of sockets collected from one hash bucket.
 */
struct bpf_tcp_iter_state {
	/* NOTE(review): presumably has to stay the first member, since the
	 * non-BPF helpers treat seq->private as a struct tcp_iter_state -
	 * confirm against the callers.
	 */
	struct tcp_iter_state state;
	unsigned int cur_sk;	/* index of the next batch entry to consume/release */
	unsigned int end_sk;	/* one past the last filled batch entry */
	unsigned int max_sk;	/* number of slots allocated in batch[] */
	struct sock **batch;	/* sockets collected from the bucket, each with a reference held */
	bool st_bucket_done;	/* when set, batching moves on to the next bucket */
};
2993 
/* Context handed to the BPF program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);		/* iterator metadata */
	__bpf_md_ptr(struct sock_common *, sk_common);		/* socket being visited */
	uid_t uid __aligned(8);					/* uid supplied by the caller */
};
2999 
3000 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3001 			     struct sock_common *sk_common, uid_t uid)
3002 {
3003 	struct bpf_iter__tcp ctx;
3004 
3005 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3006 	ctx.meta = meta;
3007 	ctx.sk_common = sk_common;
3008 	ctx.uid = uid;
3009 	return bpf_iter_run_prog(prog, &ctx);
3010 }
3011 
3012 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3013 {
3014 	while (iter->cur_sk < iter->end_sk)
3015 		sock_gen_put(iter->batch[iter->cur_sk++]);
3016 }
3017 
3018 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3019 				      unsigned int new_batch_sz)
3020 {
3021 	struct sock **new_batch;
3022 
3023 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3024 			     GFP_USER | __GFP_NOWARN);
3025 	if (!new_batch)
3026 		return -ENOMEM;
3027 
3028 	bpf_iter_tcp_put_batch(iter);
3029 	kvfree(iter->batch);
3030 	iter->batch = new_batch;
3031 	iter->max_sk = new_batch_sz;
3032 
3033 	return 0;
3034 }
3035 
3036 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3037 						 struct sock *start_sk)
3038 {
3039 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3040 	struct bpf_tcp_iter_state *iter = seq->private;
3041 	struct tcp_iter_state *st = &iter->state;
3042 	struct hlist_nulls_node *node;
3043 	unsigned int expected = 1;
3044 	struct sock *sk;
3045 
3046 	sock_hold(start_sk);
3047 	iter->batch[iter->end_sk++] = start_sk;
3048 
3049 	sk = sk_nulls_next(start_sk);
3050 	sk_nulls_for_each_from(sk, node) {
3051 		if (seq_sk_match(seq, sk)) {
3052 			if (iter->end_sk < iter->max_sk) {
3053 				sock_hold(sk);
3054 				iter->batch[iter->end_sk++] = sk;
3055 			}
3056 			expected++;
3057 		}
3058 	}
3059 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3060 
3061 	return expected;
3062 }
3063 
3064 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3065 						   struct sock *start_sk)
3066 {
3067 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3068 	struct bpf_tcp_iter_state *iter = seq->private;
3069 	struct tcp_iter_state *st = &iter->state;
3070 	struct hlist_nulls_node *node;
3071 	unsigned int expected = 1;
3072 	struct sock *sk;
3073 
3074 	sock_hold(start_sk);
3075 	iter->batch[iter->end_sk++] = start_sk;
3076 
3077 	sk = sk_nulls_next(start_sk);
3078 	sk_nulls_for_each_from(sk, node) {
3079 		if (seq_sk_match(seq, sk)) {
3080 			if (iter->end_sk < iter->max_sk) {
3081 				sock_hold(sk);
3082 				iter->batch[iter->end_sk++] = sk;
3083 			}
3084 			expected++;
3085 		}
3086 	}
3087 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3088 
3089 	return expected;
3090 }
3091 
3092 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3093 {
3094 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3095 	struct bpf_tcp_iter_state *iter = seq->private;
3096 	struct tcp_iter_state *st = &iter->state;
3097 	unsigned int expected;
3098 	bool resized = false;
3099 	struct sock *sk;
3100 
3101 	/* The st->bucket is done.  Directly advance to the next
3102 	 * bucket instead of having the tcp_seek_last_pos() to skip
3103 	 * one by one in the current bucket and eventually find out
3104 	 * it has to advance to the next bucket.
3105 	 */
3106 	if (iter->st_bucket_done) {
3107 		st->offset = 0;
3108 		st->bucket++;
3109 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3110 		    st->bucket > hinfo->lhash2_mask) {
3111 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3112 			st->bucket = 0;
3113 		}
3114 	}
3115 
3116 again:
3117 	/* Get a new batch */
3118 	iter->cur_sk = 0;
3119 	iter->end_sk = 0;
3120 	iter->st_bucket_done = false;
3121 
3122 	sk = tcp_seek_last_pos(seq);
3123 	if (!sk)
3124 		return NULL; /* Done */
3125 
3126 	if (st->state == TCP_SEQ_STATE_LISTENING)
3127 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3128 	else
3129 		expected = bpf_iter_tcp_established_batch(seq, sk);
3130 
3131 	if (iter->end_sk == expected) {
3132 		iter->st_bucket_done = true;
3133 		return sk;
3134 	}
3135 
3136 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3137 		resized = true;
3138 		goto again;
3139 	}
3140 
3141 	return sk;
3142 }
3143 
3144 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3145 {
3146 	/* bpf iter does not support lseek, so it always
3147 	 * continue from where it was stop()-ped.
3148 	 */
3149 	if (*pos)
3150 		return bpf_iter_tcp_batch(seq);
3151 
3152 	return SEQ_START_TOKEN;
3153 }
3154 
3155 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3156 {
3157 	struct bpf_tcp_iter_state *iter = seq->private;
3158 	struct tcp_iter_state *st = &iter->state;
3159 	struct sock *sk;
3160 
3161 	/* Whenever seq_next() is called, the iter->cur_sk is
3162 	 * done with seq_show(), so advance to the next sk in
3163 	 * the batch.
3164 	 */
3165 	if (iter->cur_sk < iter->end_sk) {
3166 		/* Keeping st->num consistent in tcp_iter_state.
3167 		 * bpf_iter_tcp does not use st->num.
3168 		 * meta.seq_num is used instead.
3169 		 */
3170 		st->num++;
3171 		/* Move st->offset to the next sk in the bucket such that
3172 		 * the future start() will resume at st->offset in
3173 		 * st->bucket.  See tcp_seek_last_pos().
3174 		 */
3175 		st->offset++;
3176 		sock_gen_put(iter->batch[iter->cur_sk++]);
3177 	}
3178 
3179 	if (iter->cur_sk < iter->end_sk)
3180 		sk = iter->batch[iter->cur_sk];
3181 	else
3182 		sk = bpf_iter_tcp_batch(seq);
3183 
3184 	++*pos;
3185 	/* Keeping st->last_pos consistent in tcp_iter_state.
3186 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3187 	 */
3188 	st->last_pos = *pos;
3189 	return sk;
3190 }
3191 
3192 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3193 {
3194 	struct bpf_iter_meta meta;
3195 	struct bpf_prog *prog;
3196 	struct sock *sk = v;
3197 	uid_t uid;
3198 	int ret;
3199 
3200 	if (v == SEQ_START_TOKEN)
3201 		return 0;
3202 
3203 	if (sk_fullsock(sk))
3204 		lock_sock(sk);
3205 
3206 	if (unlikely(sk_unhashed(sk))) {
3207 		ret = SEQ_SKIP;
3208 		goto unlock;
3209 	}
3210 
3211 	if (sk->sk_state == TCP_TIME_WAIT) {
3212 		uid = 0;
3213 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3214 		const struct request_sock *req = v;
3215 
3216 		uid = from_kuid_munged(seq_user_ns(seq),
3217 				       sock_i_uid(req->rsk_listener));
3218 	} else {
3219 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3220 	}
3221 
3222 	meta.seq = seq;
3223 	prog = bpf_iter_get_info(&meta, false);
3224 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3225 
3226 unlock:
3227 	if (sk_fullsock(sk))
3228 		release_sock(sk);
3229 	return ret;
3230 
3231 }
3232 
3233 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3234 {
3235 	struct bpf_tcp_iter_state *iter = seq->private;
3236 	struct bpf_iter_meta meta;
3237 	struct bpf_prog *prog;
3238 
3239 	if (!v) {
3240 		meta.seq = seq;
3241 		prog = bpf_iter_get_info(&meta, true);
3242 		if (prog)
3243 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3244 	}
3245 
3246 	if (iter->cur_sk < iter->end_sk) {
3247 		bpf_iter_tcp_put_batch(iter);
3248 		iter->st_bucket_done = false;
3249 	}
3250 }
3251 
3252 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3253 	.show		= bpf_iter_tcp_seq_show,
3254 	.start		= bpf_iter_tcp_seq_start,
3255 	.next		= bpf_iter_tcp_seq_next,
3256 	.stop		= bpf_iter_tcp_seq_stop,
3257 };
3258 #endif
3259 static unsigned short seq_file_family(const struct seq_file *seq)
3260 {
3261 	const struct tcp_seq_afinfo *afinfo;
3262 
3263 #ifdef CONFIG_BPF_SYSCALL
3264 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3265 	if (seq->op == &bpf_iter_tcp_seq_ops)
3266 		return AF_UNSPEC;
3267 #endif
3268 
3269 	/* Iterated from proc fs */
3270 	afinfo = pde_data(file_inode(seq->file));
3271 	return afinfo->family;
3272 }
3273 
3274 static const struct seq_operations tcp4_seq_ops = {
3275 	.show		= tcp4_seq_show,
3276 	.start		= tcp_seq_start,
3277 	.next		= tcp_seq_next,
3278 	.stop		= tcp_seq_stop,
3279 };
3280 
/* Data attached to the procfs "tcp" entry; seq_file_family() reads
 * ->family from it.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3284 
3285 static int __net_init tcp4_proc_init_net(struct net *net)
3286 {
3287 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3288 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3289 		return -ENOMEM;
3290 	return 0;
3291 }
3292 
3293 static void __net_exit tcp4_proc_exit_net(struct net *net)
3294 {
3295 	remove_proc_entry("tcp", net->proc_net);
3296 }
3297 
3298 static struct pernet_operations tcp4_net_ops = {
3299 	.init = tcp4_proc_init_net,
3300 	.exit = tcp4_proc_exit_net,
3301 };
3302 
3303 int __init tcp4_proc_init(void)
3304 {
3305 	return register_pernet_subsys(&tcp4_net_ops);
3306 }
3307 
3308 void tcp4_proc_exit(void)
3309 {
3310 	unregister_pernet_subsys(&tcp4_net_ops);
3311 }
3312 #endif /* CONFIG_PROC_FS */
3313 
3314 /* @wake is one when sk_stream_write_space() calls us.
3315  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3316  * This mimics the strategy used in sock_def_write_space().
3317  */
3318 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3319 {
3320 	const struct tcp_sock *tp = tcp_sk(sk);
3321 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3322 			    READ_ONCE(tp->snd_nxt);
3323 
3324 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3325 }
3326 EXPORT_SYMBOL(tcp_stream_memory_free);
3327 
/* struct proto instance named "TCP": socket callbacks, memory
 * accounting hooks and object parameters.  Exported for other users.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	/* Socket operations. */
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	/* Hash table and port management. */
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory pressure and accounting. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,

	.memory_allocated	= &tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	/* Offsets of the per-netns wmem/rmem sysctls inside struct net. */
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	/* Socket object size and slab parameters. */
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* No static hashinfo here; the per-netns one is set up by
	 * tcp_set_hashinfo().
	 */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3379 
3380 static void __net_exit tcp_sk_exit(struct net *net)
3381 {
3382 	if (net->ipv4.tcp_congestion_control)
3383 		bpf_module_put(net->ipv4.tcp_congestion_control,
3384 			       net->ipv4.tcp_congestion_control->owner);
3385 }
3386 
3387 static void __net_init tcp_set_hashinfo(struct net *net)
3388 {
3389 	struct inet_hashinfo *hinfo;
3390 	unsigned int ehash_entries;
3391 	struct net *old_net;
3392 
3393 	if (net_eq(net, &init_net))
3394 		goto fallback;
3395 
3396 	old_net = current->nsproxy->net_ns;
3397 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3398 	if (!ehash_entries)
3399 		goto fallback;
3400 
3401 	ehash_entries = roundup_pow_of_two(ehash_entries);
3402 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3403 	if (!hinfo) {
3404 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3405 			"for a netns, fallback to the global one\n",
3406 			ehash_entries);
3407 fallback:
3408 		hinfo = &tcp_hashinfo;
3409 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3410 	}
3411 
3412 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3413 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3414 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3415 }
3416 
3417 static int __net_init tcp_sk_init(struct net *net)
3418 {
3419 	net->ipv4.sysctl_tcp_ecn = 2;
3420 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3421 
3422 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3423 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3424 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3425 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3426 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3427 
3428 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3429 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3430 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3431 
3432 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3433 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3434 	net->ipv4.sysctl_tcp_syncookies = 1;
3435 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3436 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3437 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3438 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3439 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3440 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3441 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3442 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3443 
3444 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3445 	tcp_set_hashinfo(net);
3446 
3447 	net->ipv4.sysctl_tcp_sack = 1;
3448 	net->ipv4.sysctl_tcp_window_scaling = 1;
3449 	net->ipv4.sysctl_tcp_timestamps = 1;
3450 	net->ipv4.sysctl_tcp_early_retrans = 3;
3451 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3452 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3453 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3454 	net->ipv4.sysctl_tcp_max_reordering = 300;
3455 	net->ipv4.sysctl_tcp_dsack = 1;
3456 	net->ipv4.sysctl_tcp_app_win = 31;
3457 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3458 	net->ipv4.sysctl_tcp_frto = 2;
3459 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3460 	/* This limits the percentage of the congestion window which we
3461 	 * will allow a single TSO frame to consume.  Building TSO frames
3462 	 * which are too large can cause TCP streams to be bursty.
3463 	 */
3464 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3465 	/* Default TSQ limit of 16 TSO segments */
3466 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3467 
3468 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3469 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3470 
3471 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3472 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3473 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3474 	net->ipv4.sysctl_tcp_autocorking = 1;
3475 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3476 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3477 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3478 	if (net != &init_net) {
3479 		memcpy(net->ipv4.sysctl_tcp_rmem,
3480 		       init_net.ipv4.sysctl_tcp_rmem,
3481 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3482 		memcpy(net->ipv4.sysctl_tcp_wmem,
3483 		       init_net.ipv4.sysctl_tcp_wmem,
3484 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3485 	}
3486 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3487 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3488 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3489 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3490 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3491 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3492 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3493 
3494 	/* Set default values for PLB */
3495 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3496 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3497 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3498 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3499 	/* Default congestion threshold for PLB to mark a round is 50% */
3500 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3501 
3502 	/* Reno is always built in */
3503 	if (!net_eq(net, &init_net) &&
3504 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3505 			       init_net.ipv4.tcp_congestion_control->owner))
3506 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3507 	else
3508 		net->ipv4.tcp_congestion_control = &tcp_reno;
3509 
3510 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3511 	net->ipv4.sysctl_tcp_shrink_window = 0;
3512 
3513 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3514 
3515 	return 0;
3516 }
3517 
3518 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3519 {
3520 	struct net *net;
3521 
3522 	tcp_twsk_purge(net_exit_list);
3523 
3524 	list_for_each_entry(net, net_exit_list, exit_list) {
3525 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3526 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3527 		tcp_fastopen_ctx_destroy(net);
3528 	}
3529 }
3530 
3531 static struct pernet_operations __net_initdata tcp_sk_ops = {
3532        .init	   = tcp_sk_init,
3533        .exit	   = tcp_sk_exit,
3534        .exit_batch = tcp_sk_exit_batch,
3535 };
3536 
3537 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Argument list of the "tcp" BPF iterator; the names mirror the fields
 * of struct bpf_iter__tcp.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)
3540 
/* Number of socket slots allocated for the batch by bpf_iter_init_tcp(). */
#define INIT_BATCH_SZ 16
3542 
3543 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3544 {
3545 	struct bpf_tcp_iter_state *iter = priv_data;
3546 	int err;
3547 
3548 	err = bpf_iter_init_seq_net(priv_data, aux);
3549 	if (err)
3550 		return err;
3551 
3552 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3553 	if (err) {
3554 		bpf_iter_fini_seq_net(priv_data);
3555 		return err;
3556 	}
3557 
3558 	return 0;
3559 }
3560 
/* Tear down the private data set up by bpf_iter_init_tcp(): the netns
 * part, then the batch array.
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	/* Only the array is freed here; no socket reference is dropped. */
	kvfree(iter->batch);
}
3568 
3569 static const struct bpf_iter_seq_info tcp_seq_info = {
3570 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3571 	.init_seq_private	= bpf_iter_init_tcp,
3572 	.fini_seq_private	= bpf_iter_fini_tcp,
3573 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3574 };
3575 
3576 static const struct bpf_func_proto *
3577 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3578 			    const struct bpf_prog *prog)
3579 {
3580 	switch (func_id) {
3581 	case BPF_FUNC_setsockopt:
3582 		return &bpf_sk_setsockopt_proto;
3583 	case BPF_FUNC_getsockopt:
3584 		return &bpf_sk_getsockopt_proto;
3585 	default:
3586 		return NULL;
3587 	}
3588 }
3589 
/* Registration record of the "tcp" BPF iterator target.  Not const:
 * bpf_iter_register() fills in the btf_id of ctx_arg_info[0].
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		/* sk_common may be NULL, see bpf_iter_tcp_seq_stop(). */
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3600 
3601 static void __init bpf_iter_register(void)
3602 {
3603 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3604 	if (bpf_iter_reg_target(&tcp_reg_info))
3605 		pr_warn("Warning: could not register bpf iterator tcp\n");
3606 }
3607 
3608 #endif
3609 
3610 void __init tcp_v4_init(void)
3611 {
3612 	int cpu, res;
3613 
3614 	for_each_possible_cpu(cpu) {
3615 		struct sock *sk;
3616 
3617 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3618 					   IPPROTO_TCP, &init_net);
3619 		if (res)
3620 			panic("Failed to create the TCP control socket.\n");
3621 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3622 
3623 		/* Please enforce IP_DF and IPID==0 for RST and
3624 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3625 		 */
3626 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3627 
3628 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3629 	}
3630 	if (register_pernet_subsys(&tcp_sk_ops))
3631 		panic("Failed to create the TCP control socket.\n");
3632 
3633 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3634 	bpf_iter_register();
3635 #endif
3636 }
3637