1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
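/* Derive the initial sequence number for a connection from the address and
 * port pair of the incoming segment (see secure_tcp_seq()).
 */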
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
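/* Compute the per-connection timestamp offset from the address pair. */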
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
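/* Called at connect() time when the chosen four-tuple is currently held by a
 * TIME-WAIT socket.  Returns 1 if the tuple may be reused (seeding write_seq
 * and the timestamp state from the old connection), 0 otherwise.
 */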
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 
117 	if (reuse == 2) {
118 		/* Still does not detect *everything* that goes through
119 		 * lo, since we require a loopback src or dst address
120 		 * or direct binding to 'lo' interface.
121 		 */
122 		bool loopback = false;
123 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 			loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126 		if (tw->tw_family == AF_INET6) {
127 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 				loopback = true;
132 		} else
133 #endif
134 		{
135 			if (ipv4_is_loopback(tw->tw_daddr) ||
136 			    ipv4_is_loopback(tw->tw_rcv_saddr))
137 				loopback = true;
138 		}
139 		if (!loopback)
140 			reuse = 0;
141 	}
142 
143 	/* With PAWS, it is safe from the viewpoint
144 	   of data integrity. Even without PAWS it is safe provided sequence
145 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
146 
147 	   Actually, the idea is close to VJ's, except that the timestamp
148 	   cache is held not per host but per port pair, and the TW bucket
149 	   is used as the state holder.
150 
151 	   If the TW bucket has already been destroyed we fall back to VJ's
152 	   scheme and use the initial timestamp retrieved from the peer table.
153 	 */
154 	if (tcptw->tw_ts_recent_stamp &&
155 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156 					    tcptw->tw_ts_recent_stamp)))) {
157 		/* inet_twsk_hashdance() sets sk_refcnt after putting twsk
158 		 * and releasing the bucket lock.
159 		 */
160 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
161 			return 0;
162 
163 		/* In case of repair and re-using TIME-WAIT sockets we still
164 		 * want to be sure that it is safe as above but honor the
165 		 * sequence numbers and time stamps set as part of the repair
166 		 * process.
167 		 *
168 		 * Without this check re-using a TIME-WAIT socket with TCP
169 		 * repair would accumulate a -1 on the repair assigned
170 		 * sequence number. The first time it is reused the sequence
171 		 * is -1, the second time -2, etc. This fixes that issue
172 		 * without appearing to create any others.
173 		 */
174 		if (likely(!tp->repair)) {
175 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
176 
177 			if (!seq)
178 				seq = 1;
179 			WRITE_ONCE(tp->write_seq, seq);
180 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
181 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
182 		}
183 
184 		return 1;
185 	}
186 
187 	return 0;
188 }
189 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
190 
191 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
192 			      int addr_len)
193 {
194 	/* This check is replicated from tcp_v4_connect() and is intended to
195 	 * prevent the BPF program called below from accessing bytes that are
196 	 * out of the bounds specified by the user in addr_len.
197 	 */
198 	if (addr_len < sizeof(struct sockaddr_in))
199 		return -EINVAL;
200 
201 	sock_owned_by_me(sk);
202 
203 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
204 }
205 
206 /* This will initiate an outgoing connection. */
207 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
208 {
209 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
210 	struct inet_timewait_death_row *tcp_death_row;
211 	struct inet_sock *inet = inet_sk(sk);
212 	struct tcp_sock *tp = tcp_sk(sk);
213 	struct ip_options_rcu *inet_opt;
214 	struct net *net = sock_net(sk);
215 	__be16 orig_sport, orig_dport;
216 	__be32 daddr, nexthop;
217 	struct flowi4 *fl4;
218 	struct rtable *rt;
219 	int err;
220 
221 	if (addr_len < sizeof(struct sockaddr_in))
222 		return -EINVAL;
223 
224 	if (usin->sin_family != AF_INET)
225 		return -EAFNOSUPPORT;
226 
227 	nexthop = daddr = usin->sin_addr.s_addr;
228 	inet_opt = rcu_dereference_protected(inet->inet_opt,
229 					     lockdep_sock_is_held(sk));
230 	if (inet_opt && inet_opt->opt.srr) {
231 		if (!daddr)
232 			return -EINVAL;
233 		nexthop = inet_opt->opt.faddr;
234 	}
235 
236 	orig_sport = inet->inet_sport;
237 	orig_dport = usin->sin_port;
238 	fl4 = &inet->cork.fl.u.ip4;
239 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
240 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
241 			      orig_dport, sk);
242 	if (IS_ERR(rt)) {
243 		err = PTR_ERR(rt);
244 		if (err == -ENETUNREACH)
245 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
246 		return err;
247 	}
248 
249 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
250 		ip_rt_put(rt);
251 		return -ENETUNREACH;
252 	}
253 
254 	if (!inet_opt || !inet_opt->opt.srr)
255 		daddr = fl4->daddr;
256 
257 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
258 
259 	if (!inet->inet_saddr) {
260 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
261 		if (err) {
262 			ip_rt_put(rt);
263 			return err;
264 		}
265 	} else {
266 		sk_rcv_saddr_set(sk, inet->inet_saddr);
267 	}
268 
269 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
270 		/* Reset inherited state */
271 		tp->rx_opt.ts_recent	   = 0;
272 		tp->rx_opt.ts_recent_stamp = 0;
273 		if (likely(!tp->repair))
274 			WRITE_ONCE(tp->write_seq, 0);
275 	}
276 
277 	inet->inet_dport = usin->sin_port;
278 	sk_daddr_set(sk, daddr);
279 
280 	inet_csk(sk)->icsk_ext_hdr_len = 0;
281 	if (inet_opt)
282 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
283 
284 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
285 
286 	/* Socket identity is still unknown (sport may be zero).
287 	 * However, we set the state to SYN-SENT and, without releasing the
288 	 * socket lock, select a source port, enter ourselves into the hash
289 	 * tables and complete initialization after this.
290 	 */
291 	tcp_set_state(sk, TCP_SYN_SENT);
292 	err = inet_hash_connect(tcp_death_row, sk);
293 	if (err)
294 		goto failure;
295 
296 	sk_set_txhash(sk);
297 
298 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
299 			       inet->inet_sport, inet->inet_dport, sk);
300 	if (IS_ERR(rt)) {
301 		err = PTR_ERR(rt);
302 		rt = NULL;
303 		goto failure;
304 	}
305 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
306 	/* OK, now commit destination to socket.  */
307 	sk->sk_gso_type = SKB_GSO_TCPV4;
308 	sk_setup_caps(sk, &rt->dst);
309 	rt = NULL;
310 
311 	if (likely(!tp->repair)) {
312 		if (!tp->write_seq)
313 			WRITE_ONCE(tp->write_seq,
314 				   secure_tcp_seq(inet->inet_saddr,
315 						  inet->inet_daddr,
316 						  inet->inet_sport,
317 						  usin->sin_port));
318 		WRITE_ONCE(tp->tsoffset,
319 			   secure_tcp_ts_off(net, inet->inet_saddr,
320 					     inet->inet_daddr));
321 	}
322 
323 	atomic_set(&inet->inet_id, get_random_u16());
324 
325 	if (tcp_fastopen_defer_connect(sk, &err))
326 		return err;
327 	if (err)
328 		goto failure;
329 
330 	err = tcp_connect(sk);
331 
332 	if (err)
333 		goto failure;
334 
335 	return 0;
336 
337 failure:
338 	/*
339 	 * This unhashes the socket and releases the local port,
340 	 * if necessary.
341 	 */
342 	tcp_set_state(sk, TCP_CLOSE);
343 	inet_bhash2_reset_saddr(sk);
344 	ip_rt_put(rt);
345 	sk->sk_route_caps = 0;
346 	inet->inet_dport = 0;
347 	return err;
348 }
349 EXPORT_SYMBOL(tcp_v4_connect);
350 
351 /*
352  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
353  * It can be called through tcp_release_cb() if the socket was owned by the
354  * user at the time tcp_v4_err() was called to handle the ICMP message.
355  */
356 void tcp_v4_mtu_reduced(struct sock *sk)
357 {
358 	struct inet_sock *inet = inet_sk(sk);
359 	struct dst_entry *dst;
360 	u32 mtu;
361 
362 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
363 		return;
364 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
365 	dst = inet_csk_update_pmtu(sk, mtu);
366 	if (!dst)
367 		return;
368 
369 	/* Something is about to go wrong... Remember the soft error
370 	 * in case this connection is not able to recover.
371 	 */
372 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
373 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
374 
375 	mtu = dst_mtu(dst);
376 
377 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
378 	    ip_sk_accept_pmtu(sk) &&
379 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
380 		tcp_sync_mss(sk, mtu);
381 
382 		/* Resend the TCP packet because it's
383 		 * clear that the old packet has been
384 		 * dropped. This is the new "fast" path mtu
385 		 * discovery.
386 		 */
387 		tcp_simple_retransmit(sk);
388 	} /* else let the usual retransmit timer handle it */
389 }
390 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
391 
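/* Handle an ICMP redirect by updating the socket's cached route, if any. */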
392 static void do_redirect(struct sk_buff *skb, struct sock *sk)
393 {
394 	struct dst_entry *dst = __sk_dst_check(sk, 0);
395 
396 	if (dst)
397 		dst->ops->redirect(dst, sk, skb);
398 }
399 
400 
401 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
402 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
403 {
404 	struct request_sock *req = inet_reqsk(sk);
405 	struct net *net = sock_net(sk);
406 
407 	/* ICMPs are not backlogged, hence we cannot get
408 	 * an established socket here.
409 	 */
410 	if (seq != tcp_rsk(req)->snt_isn) {
411 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
412 	} else if (abort) {
413 		/*
414 		 * Still in SYN_RECV, just remove it silently.
415 		 * There is no good way to pass the error to the newly
416 		 * created socket, and POSIX does not want network
417 		 * errors returned from accept().
418 		 */
419 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
420 		tcp_listendrop(req->rsk_listener);
421 	}
422 	reqsk_put(req);
423 }
424 EXPORT_SYMBOL(tcp_req_err);
425 
426 /* TCP-LD (RFC 6069) logic */
427 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
428 {
429 	struct inet_connection_sock *icsk = inet_csk(sk);
430 	struct tcp_sock *tp = tcp_sk(sk);
431 	struct sk_buff *skb;
432 	s32 remaining;
433 	u32 delta_us;
434 
435 	if (sock_owned_by_user(sk))
436 		return;
437 
438 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
439 	    !icsk->icsk_backoff)
440 		return;
441 
442 	skb = tcp_rtx_queue_head(sk);
443 	if (WARN_ON_ONCE(!skb))
444 		return;
445 
446 	icsk->icsk_backoff--;
447 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
448 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
449 
450 	tcp_mstamp_refresh(tp);
451 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
452 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
453 
454 	if (remaining > 0) {
455 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
456 					  remaining, TCP_RTO_MAX);
457 	} else {
458 		/* RTO revert clocked out retransmission.
459 		 * Will retransmit now.
460 		 */
461 		tcp_retransmit_timer(sk);
462 	}
463 }
464 EXPORT_SYMBOL(tcp_ld_RTO_revert);
465 
466 /*
467  * This routine is called by the ICMP module when it gets some
468  * sort of error condition.  If err < 0 then the socket should
469  * be closed and the error returned to the user.  If err > 0
470  * it's just the icmp type << 8 | icmp code.  After adjustment
471  * header points to the first 8 bytes of the tcp header.  We need
472  * to find the appropriate port.
473  *
474  * The locking strategy used here is very "optimistic". When
475  * someone else accesses the socket the ICMP is just dropped
476  * and for some paths there is no check at all.
477  * A more general error queue to queue errors for later handling
478  * is probably better.
479  *
480  */
481 
482 int tcp_v4_err(struct sk_buff *skb, u32 info)
483 {
484 	const struct iphdr *iph = (const struct iphdr *)skb->data;
485 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
486 	struct tcp_sock *tp;
487 	const int type = icmp_hdr(skb)->type;
488 	const int code = icmp_hdr(skb)->code;
489 	struct sock *sk;
490 	struct request_sock *fastopen;
491 	u32 seq, snd_una;
492 	int err;
493 	struct net *net = dev_net(skb->dev);
494 
495 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
496 				       iph->daddr, th->dest, iph->saddr,
497 				       ntohs(th->source), inet_iif(skb), 0);
498 	if (!sk) {
499 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
500 		return -ENOENT;
501 	}
502 	if (sk->sk_state == TCP_TIME_WAIT) {
503 		/* To increase the counter of ignored icmps for TCP-AO */
504 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
505 		inet_twsk_put(inet_twsk(sk));
506 		return 0;
507 	}
508 	seq = ntohl(th->seq);
509 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
510 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
511 				     type == ICMP_TIME_EXCEEDED ||
512 				     (type == ICMP_DEST_UNREACH &&
513 				      (code == ICMP_NET_UNREACH ||
514 				       code == ICMP_HOST_UNREACH)));
515 		return 0;
516 	}
517 
518 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
519 		sock_put(sk);
520 		return 0;
521 	}
522 
523 	bh_lock_sock(sk);
524 	/* If too many ICMPs get dropped on busy
525 	 * servers this needs to be solved differently.
526 	 * We do take care of the PMTU discovery (RFC 1191) special case:
527 	 * we can receive locally generated ICMP messages while socket is held.
528 	 */
529 	if (sock_owned_by_user(sk)) {
530 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
531 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
532 	}
533 	if (sk->sk_state == TCP_CLOSE)
534 		goto out;
535 
536 	if (static_branch_unlikely(&ip4_min_ttl)) {
537 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
538 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
539 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
540 			goto out;
541 		}
542 	}
543 
544 	tp = tcp_sk(sk);
545 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
546 	fastopen = rcu_dereference(tp->fastopen_rsk);
547 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
548 	if (sk->sk_state != TCP_LISTEN &&
549 	    !between(seq, snd_una, tp->snd_nxt)) {
550 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
551 		goto out;
552 	}
553 
554 	switch (type) {
555 	case ICMP_REDIRECT:
556 		if (!sock_owned_by_user(sk))
557 			do_redirect(skb, sk);
558 		goto out;
559 	case ICMP_SOURCE_QUENCH:
560 		/* Just silently ignore these. */
561 		goto out;
562 	case ICMP_PARAMETERPROB:
563 		err = EPROTO;
564 		break;
565 	case ICMP_DEST_UNREACH:
566 		if (code > NR_ICMP_UNREACH)
567 			goto out;
568 
569 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
570 			/* We are not interested in TCP_LISTEN and open_requests
571 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
572 			 * they should go through unfragmented).
573 			 */
574 			if (sk->sk_state == TCP_LISTEN)
575 				goto out;
576 
577 			WRITE_ONCE(tp->mtu_info, info);
578 			if (!sock_owned_by_user(sk)) {
579 				tcp_v4_mtu_reduced(sk);
580 			} else {
581 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
582 					sock_hold(sk);
583 			}
584 			goto out;
585 		}
586 
587 		err = icmp_err_convert[code].errno;
588 		/* check if this ICMP message allows revert of backoff.
589 		 * (see RFC 6069)
590 		 */
591 		if (!fastopen &&
592 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
593 			tcp_ld_RTO_revert(sk, seq);
594 		break;
595 	case ICMP_TIME_EXCEEDED:
596 		err = EHOSTUNREACH;
597 		break;
598 	default:
599 		goto out;
600 	}
601 
602 	switch (sk->sk_state) {
603 	case TCP_SYN_SENT:
604 	case TCP_SYN_RECV:
605 		/* Only in fast or simultaneous open. If a fast open socket is
606 		 * already accepted it is treated as a connected one below.
607 		 */
608 		if (fastopen && !fastopen->sk)
609 			break;
610 
611 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
612 
613 		if (!sock_owned_by_user(sk)) {
614 			WRITE_ONCE(sk->sk_err, err);
615 
616 			sk_error_report(sk);
617 
618 			tcp_done(sk);
619 		} else {
620 			WRITE_ONCE(sk->sk_err_soft, err);
621 		}
622 		goto out;
623 	}
624 
625 	/* If we've already connected we will keep trying
626 	 * until we time out, or the user gives up.
627 	 *
628 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
629 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
630 	 * but it is obsoleted by pmtu discovery).
631 	 *
632 	 * Note that in the modern internet, where routing is unreliable
633 	 * and broken firewalls sit in every dark corner sending random
634 	 * errors ordered by their masters, even these two messages finally
635 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
636 	 *
637 	 * Now we are in compliance with RFCs.
638 	 *							--ANK (980905)
639 	 */
640 
641 	if (!sock_owned_by_user(sk) &&
642 	    inet_test_bit(RECVERR, sk)) {
643 		WRITE_ONCE(sk->sk_err, err);
644 		sk_error_report(sk);
645 	} else	{ /* Only an error on timeout */
646 		WRITE_ONCE(sk->sk_err_soft, err);
647 	}
648 
649 out:
650 	bh_unlock_sock(sk);
651 	sock_put(sk);
652 	return 0;
653 }
654 
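/* Fill in the pseudo-header checksum and set csum_start/csum_offset so that
 * the device (or the software fallback) can complete the TCP checksum.
 */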
655 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
656 {
657 	struct tcphdr *th = tcp_hdr(skb);
658 
659 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
660 	skb->csum_start = skb_transport_header(skb) - skb->head;
661 	skb->csum_offset = offsetof(struct tcphdr, check);
662 }
663 
664 /* This routine computes an IPv4 TCP checksum. */
665 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
666 {
667 	const struct inet_sock *inet = inet_sk(sk);
668 
669 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
670 }
671 EXPORT_SYMBOL(tcp_v4_send_check);
672 
673 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
674 
675 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
676 				 const struct tcp_ao_hdr *aoh,
677 				 struct ip_reply_arg *arg, struct tcphdr *reply,
678 				 __be32 reply_options[REPLY_OPTIONS_LEN])
679 {
680 #ifdef CONFIG_TCP_AO
681 	int sdif = tcp_v4_sdif(skb);
682 	int dif = inet_iif(skb);
683 	int l3index = sdif ? dif : 0;
684 	bool allocated_traffic_key;
685 	struct tcp_ao_key *key;
686 	char *traffic_key;
687 	bool drop = true;
688 	u32 ao_sne = 0;
689 	u8 keyid;
690 
691 	rcu_read_lock();
692 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
693 				 &key, &traffic_key, &allocated_traffic_key,
694 				 &keyid, &ao_sne))
695 		goto out;
696 
697 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
698 				 (aoh->rnext_keyid << 8) | keyid);
699 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
700 	reply->doff = arg->iov[0].iov_len / 4;
701 
702 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
703 			    key, traffic_key,
704 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
705 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
706 			    reply, ao_sne))
707 		goto out;
708 	drop = false;
709 out:
710 	rcu_read_unlock();
711 	if (allocated_traffic_key)
712 		kfree(traffic_key);
713 	return drop;
714 #else
715 	return true;
716 #endif
717 }
718 
719 /*
720  *	This routine will send an RST to the other TCP.
721  *
722  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
723  *		      for the reset?
724  *	Answer: if a packet caused an RST, it is not for a socket
725  *		existing in our system; if it is matched to a socket,
726  *		it is just a duplicate segment or a bug in the other side's TCP.
727  *		So we build the reply based only on parameters that
728  *		arrived with the segment.
729  *	Exception: precedence violation. We do not implement it in any case.
730  */
731 
732 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
733 {
734 	const struct tcphdr *th = tcp_hdr(skb);
735 	struct {
736 		struct tcphdr th;
737 		__be32 opt[REPLY_OPTIONS_LEN];
738 	} rep;
739 	const __u8 *md5_hash_location = NULL;
740 	const struct tcp_ao_hdr *aoh;
741 	struct ip_reply_arg arg;
742 #ifdef CONFIG_TCP_MD5SIG
743 	struct tcp_md5sig_key *key = NULL;
744 	unsigned char newhash[16];
745 	struct sock *sk1 = NULL;
746 	int genhash;
747 #endif
748 	u64 transmit_time = 0;
749 	struct sock *ctl_sk;
750 	struct net *net;
751 	u32 txhash = 0;
752 
753 	/* Never send a reset in response to a reset. */
754 	if (th->rst)
755 		return;
756 
757 	/* If sk is not NULL, it means we did a successful lookup and the
758 	 * incoming route had to be correct. prequeue might have dropped our dst.
759 	 */
760 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
761 		return;
762 
763 	/* Swap the send and the receive. */
764 	memset(&rep, 0, sizeof(rep));
765 	rep.th.dest   = th->source;
766 	rep.th.source = th->dest;
767 	rep.th.doff   = sizeof(struct tcphdr) / 4;
768 	rep.th.rst    = 1;
769 
770 	if (th->ack) {
771 		rep.th.seq = th->ack_seq;
772 	} else {
773 		rep.th.ack = 1;
774 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
775 				       skb->len - (th->doff << 2));
776 	}
777 
778 	memset(&arg, 0, sizeof(arg));
779 	arg.iov[0].iov_base = (unsigned char *)&rep;
780 	arg.iov[0].iov_len  = sizeof(rep.th);
781 
782 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
783 
784 	/* Invalid TCP option size or twice included auth */
785 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
786 		return;
787 
788 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
789 		return;
790 
791 #ifdef CONFIG_TCP_MD5SIG
792 	rcu_read_lock();
793 	if (sk && sk_fullsock(sk)) {
794 		const union tcp_md5_addr *addr;
795 		int l3index;
796 
797 		/* If sdif is set, the packet ingressed via a device
798 		 * in an L3 domain and inet_iif is set to it.
799 		 */
800 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
801 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
802 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
803 	} else if (md5_hash_location) {
804 		const union tcp_md5_addr *addr;
805 		int sdif = tcp_v4_sdif(skb);
806 		int dif = inet_iif(skb);
807 		int l3index;
808 
809 		/*
810 		 * The active side is lost. Try to find the listening socket
811 		 * through the source port, and then find the md5 key through
812 		 * the listening socket. We do not loosen security here:
813 		 * the incoming packet is checked against the md5 hash of the
814 		 * found key, and no RST is generated if the hash doesn't match.
815 		 */
816 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
817 					     NULL, 0, ip_hdr(skb)->saddr,
818 					     th->source, ip_hdr(skb)->daddr,
819 					     ntohs(th->source), dif, sdif);
820 		/* don't send rst if it can't find key */
821 		if (!sk1)
822 			goto out;
823 
824 		/* If sdif is set, the packet ingressed via a device
825 		 * in an L3 domain and dif is set to it.
826 		 */
827 		l3index = sdif ? dif : 0;
828 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
829 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
830 		if (!key)
831 			goto out;
832 
833 
834 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
835 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
836 			goto out;
837 
838 	}
839 
840 	if (key) {
841 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
842 				   (TCPOPT_NOP << 16) |
843 				   (TCPOPT_MD5SIG << 8) |
844 				   TCPOLEN_MD5SIG);
845 		/* Update length and the length the header thinks exists */
846 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
847 		rep.th.doff = arg.iov[0].iov_len / 4;
848 
849 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
850 				     key, ip_hdr(skb)->saddr,
851 				     ip_hdr(skb)->daddr, &rep.th);
852 	}
853 #endif
854 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
855 	if (rep.opt[0] == 0) {
856 		__be32 mrst = mptcp_reset_option(skb);
857 
858 		if (mrst) {
859 			rep.opt[0] = mrst;
860 			arg.iov[0].iov_len += sizeof(mrst);
861 			rep.th.doff = arg.iov[0].iov_len / 4;
862 		}
863 	}
864 
865 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
866 				      ip_hdr(skb)->saddr, /* XXX */
867 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
868 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
869 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
870 
871 	/* When the socket is gone, all binding information is lost and
872 	 * routing might fail in this case. No choice here: if we force the
873 	 * input interface, we will misroute in case of an asymmetric route.
874 	 */
875 	if (sk) {
876 		arg.bound_dev_if = sk->sk_bound_dev_if;
877 		if (sk_fullsock(sk))
878 			trace_tcp_send_reset(sk, skb);
879 	}
880 
881 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
882 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
883 
884 	arg.tos = ip_hdr(skb)->tos;
885 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
886 	local_bh_disable();
887 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
888 	sock_net_set(ctl_sk, net);
889 	if (sk) {
890 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
891 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
892 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
893 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
894 		transmit_time = tcp_transmit_time(sk);
895 		xfrm_sk_clone_policy(ctl_sk, sk);
896 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
897 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
898 	} else {
899 		ctl_sk->sk_mark = 0;
900 		ctl_sk->sk_priority = 0;
901 	}
902 	ip_send_unicast_reply(ctl_sk,
903 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
904 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
905 			      &arg, arg.iov[0].iov_len,
906 			      transmit_time, txhash);
907 
908 	xfrm_sk_free_policy(ctl_sk);
909 	sock_net_set(ctl_sk, &init_net);
910 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
911 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
912 	local_bh_enable();
913 
914 #ifdef CONFIG_TCP_MD5SIG
915 out:
916 	rcu_read_unlock();
917 #endif
918 }
919 
920 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
921    outside socket context, is certainly ugly. What can I do?
922  */
923 
924 static void tcp_v4_send_ack(const struct sock *sk,
925 			    struct sk_buff *skb, u32 seq, u32 ack,
926 			    u32 win, u32 tsval, u32 tsecr, int oif,
927 			    struct tcp_key *key,
928 			    int reply_flags, u8 tos, u32 txhash)
929 {
930 	const struct tcphdr *th = tcp_hdr(skb);
931 	struct {
932 		struct tcphdr th;
933 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
934 	} rep;
935 	struct net *net = sock_net(sk);
936 	struct ip_reply_arg arg;
937 	struct sock *ctl_sk;
938 	u64 transmit_time;
939 
940 	memset(&rep.th, 0, sizeof(struct tcphdr));
941 	memset(&arg, 0, sizeof(arg));
942 
943 	arg.iov[0].iov_base = (unsigned char *)&rep;
944 	arg.iov[0].iov_len  = sizeof(rep.th);
945 	if (tsecr) {
946 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
947 				   (TCPOPT_TIMESTAMP << 8) |
948 				   TCPOLEN_TIMESTAMP);
949 		rep.opt[1] = htonl(tsval);
950 		rep.opt[2] = htonl(tsecr);
951 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
952 	}
953 
954 	/* Swap the send and the receive. */
955 	rep.th.dest    = th->source;
956 	rep.th.source  = th->dest;
957 	rep.th.doff    = arg.iov[0].iov_len / 4;
958 	rep.th.seq     = htonl(seq);
959 	rep.th.ack_seq = htonl(ack);
960 	rep.th.ack     = 1;
961 	rep.th.window  = htons(win);
962 
963 #ifdef CONFIG_TCP_MD5SIG
964 	if (tcp_key_is_md5(key)) {
965 		int offset = (tsecr) ? 3 : 0;
966 
967 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
968 					  (TCPOPT_NOP << 16) |
969 					  (TCPOPT_MD5SIG << 8) |
970 					  TCPOLEN_MD5SIG);
971 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
972 		rep.th.doff = arg.iov[0].iov_len/4;
973 
974 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
975 				    key->md5_key, ip_hdr(skb)->saddr,
976 				    ip_hdr(skb)->daddr, &rep.th);
977 	}
978 #endif
979 #ifdef CONFIG_TCP_AO
980 	if (tcp_key_is_ao(key)) {
981 		int offset = (tsecr) ? 3 : 0;
982 
983 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
984 					  (tcp_ao_len(key->ao_key) << 16) |
985 					  (key->ao_key->sndid << 8) |
986 					  key->rcv_next);
987 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
988 		rep.th.doff = arg.iov[0].iov_len / 4;
989 
990 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
991 				key->ao_key, key->traffic_key,
992 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
993 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
994 				&rep.th, key->sne);
995 	}
996 #endif
997 	arg.flags = reply_flags;
998 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
999 				      ip_hdr(skb)->saddr, /* XXX */
1000 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1001 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1002 	if (oif)
1003 		arg.bound_dev_if = oif;
1004 	arg.tos = tos;
1005 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1006 	local_bh_disable();
1007 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
1008 	sock_net_set(ctl_sk, net);
1009 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1010 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1011 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1012 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1013 	transmit_time = tcp_transmit_time(sk);
1014 	ip_send_unicast_reply(ctl_sk,
1015 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1016 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1017 			      &arg, arg.iov[0].iov_len,
1018 			      transmit_time, txhash);
1019 
1020 	sock_net_set(ctl_sk, &init_net);
1021 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1022 	local_bh_enable();
1023 }
1024 
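/* Answer a segment received for a TIME-WAIT socket: send an ACK built from
 * the state kept in the timewait sock (including its MD5/AO key, if any).
 */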
1025 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1026 {
1027 	struct inet_timewait_sock *tw = inet_twsk(sk);
1028 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1029 	struct tcp_key key = {};
1030 #ifdef CONFIG_TCP_AO
1031 	struct tcp_ao_info *ao_info;
1032 
1033 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1034 		/* FIXME: the segment to-be-acked is not verified yet */
1035 		ao_info = rcu_dereference(tcptw->ao_info);
1036 		if (ao_info) {
1037 			const struct tcp_ao_hdr *aoh;
1038 
1039 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1040 				inet_twsk_put(tw);
1041 				return;
1042 			}
1043 
1044 			if (aoh)
1045 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1046 		}
1047 	}
1048 	if (key.ao_key) {
1049 		struct tcp_ao_key *rnext_key;
1050 
1051 		key.traffic_key = snd_other_key(key.ao_key);
1052 		key.sne = READ_ONCE(ao_info->snd_sne);
1053 		rnext_key = READ_ONCE(ao_info->rnext_key);
1054 		key.rcv_next = rnext_key->rcvid;
1055 		key.type = TCP_KEY_AO;
1056 #else
1057 	if (0) {
1058 #endif
1059 #ifdef CONFIG_TCP_MD5SIG
1060 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1061 		key.md5_key = tcp_twsk_md5_key(tcptw);
1062 		if (key.md5_key)
1063 			key.type = TCP_KEY_MD5;
1064 #endif
1065 	}
1066 
1067 	tcp_v4_send_ack(sk, skb,
1068 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1069 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1070 			tcp_tw_tsval(tcptw),
1071 			tcptw->tw_ts_recent,
1072 			tw->tw_bound_dev_if, &key,
1073 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1074 			tw->tw_tos,
1075 			tw->tw_txhash);
1076 
1077 	inet_twsk_put(tw);
1078 }
1079 
1080 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1081 				  struct request_sock *req)
1082 {
1083 	struct tcp_key key = {};
1084 
1085 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1086 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1087 	 */
1088 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1089 					     tcp_sk(sk)->snd_nxt;
1090 
1091 #ifdef CONFIG_TCP_AO
1092 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1093 	    tcp_rsk_used_ao(req)) {
1094 		const union tcp_md5_addr *addr;
1095 		const struct tcp_ao_hdr *aoh;
1096 		int l3index;
1097 
1098 		/* Invalid TCP option size or twice included auth */
1099 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1100 			return;
1101 		if (!aoh)
1102 			return;
1103 
1104 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1105 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1106 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1107 					      aoh->rnext_keyid, -1);
1108 		if (unlikely(!key.ao_key)) {
1109 			/* Send ACK with any matching MKT for the peer */
1110 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1111 			/* Matching key disappeared (user removed the key?);
1112 			 * let the handshake time out.
1113 			 */
1114 			if (!key.ao_key) {
1115 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1116 						     addr,
1117 						     ntohs(tcp_hdr(skb)->source),
1118 						     &ip_hdr(skb)->daddr,
1119 						     ntohs(tcp_hdr(skb)->dest));
1120 				return;
1121 			}
1122 		}
1123 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1124 		if (!key.traffic_key)
1125 			return;
1126 
1127 		key.type = TCP_KEY_AO;
1128 		key.rcv_next = aoh->keyid;
1129 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1130 #else
1131 	if (0) {
1132 #endif
1133 #ifdef CONFIG_TCP_MD5SIG
1134 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1135 		const union tcp_md5_addr *addr;
1136 		int l3index;
1137 
1138 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1139 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1140 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1141 		if (key.md5_key)
1142 			key.type = TCP_KEY_MD5;
1143 #endif
1144 	}
1145 
1146 	/* RFC 7323 2.3
1147 	 * The window field (SEG.WND) of every outgoing segment, with the
1148 	 * exception of <SYN> segments, MUST be right-shifted by
1149 	 * Rcv.Wind.Shift bits:
1150 	 */
1151 	tcp_v4_send_ack(sk, skb, seq,
1152 			tcp_rsk(req)->rcv_nxt,
1153 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
1154 			tcp_rsk_tsval(tcp_rsk(req)),
1155 			READ_ONCE(req->ts_recent),
1156 			0, &key,
1157 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1158 			ip_hdr(skb)->tos,
1159 			READ_ONCE(tcp_rsk(req)->txhash));
1160 	if (tcp_key_is_ao(&key))
1161 		kfree(key.traffic_key);
1162 }
1163 
1164 /*
1165  *	Send a SYN-ACK after having received a SYN.
1166  *	This still operates on a request_sock only, not on a big
1167  *	socket.
1168  */
1169 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1170 			      struct flowi *fl,
1171 			      struct request_sock *req,
1172 			      struct tcp_fastopen_cookie *foc,
1173 			      enum tcp_synack_type synack_type,
1174 			      struct sk_buff *syn_skb)
1175 {
1176 	const struct inet_request_sock *ireq = inet_rsk(req);
1177 	struct flowi4 fl4;
1178 	int err = -1;
1179 	struct sk_buff *skb;
1180 	u8 tos;
1181 
1182 	/* First, grab a route. */
1183 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1184 		return -1;
1185 
1186 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1187 
1188 	if (skb) {
1189 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1190 
1191 		tos = READ_ONCE(inet_sk(sk)->tos);
1192 
1193 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1194 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1195 			      (tos & INET_ECN_MASK);
1196 
1197 		if (!INET_ECN_is_capable(tos) &&
1198 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1199 			tos |= INET_ECN_ECT_0;
1200 
1201 		rcu_read_lock();
1202 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1203 					    ireq->ir_rmt_addr,
1204 					    rcu_dereference(ireq->ireq_opt),
1205 					    tos);
1206 		rcu_read_unlock();
1207 		err = net_xmit_eval(err);
1208 	}
1209 
1210 	return err;
1211 }
1212 
1213 /*
1214  *	IPv4 request_sock destructor.
1215  */
1216 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1217 {
1218 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1219 }
1220 
1221 #ifdef CONFIG_TCP_MD5SIG
1222 /*
1223  * RFC2385 MD5 checksumming requires a mapping of
1224  * IP address->MD5 Key.
1225  * We need to maintain these in the sk structure.
1226  */
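
/* Userspace installs keys with setsockopt(TCP_MD5SIG) (or TCP_MD5SIG_EXT,
 * both handled by tcp_v4_parse_md5_keys() below).  A minimal sketch, with
 * error handling omitted and "peer"/"secret" as placeholders:
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr = peer;
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */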
1227 
1228 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1229 EXPORT_SYMBOL(tcp_md5_needed);
1230 
1231 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1232 {
1233 	if (!old)
1234 		return true;
1235 
1236 	/* l3index always overrides non-l3index */
1237 	if (old->l3index && new->l3index == 0)
1238 		return false;
1239 	if (old->l3index == 0 && new->l3index)
1240 		return true;
1241 
1242 	return old->prefixlen < new->prefixlen;
1243 }
1244 
1245 /* Find the Key structure for an address.  */
1246 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1247 					   const union tcp_md5_addr *addr,
1248 					   int family, bool any_l3index)
1249 {
1250 	const struct tcp_sock *tp = tcp_sk(sk);
1251 	struct tcp_md5sig_key *key;
1252 	const struct tcp_md5sig_info *md5sig;
1253 	__be32 mask;
1254 	struct tcp_md5sig_key *best_match = NULL;
1255 	bool match;
1256 
1257 	/* caller either holds rcu_read_lock() or socket lock */
1258 	md5sig = rcu_dereference_check(tp->md5sig_info,
1259 				       lockdep_sock_is_held(sk));
1260 	if (!md5sig)
1261 		return NULL;
1262 
1263 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1264 				 lockdep_sock_is_held(sk)) {
1265 		if (key->family != family)
1266 			continue;
1267 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1268 		    key->l3index != l3index)
1269 			continue;
1270 		if (family == AF_INET) {
1271 			mask = inet_make_mask(key->prefixlen);
1272 			match = (key->addr.a4.s_addr & mask) ==
1273 				(addr->a4.s_addr & mask);
1274 #if IS_ENABLED(CONFIG_IPV6)
1275 		} else if (family == AF_INET6) {
1276 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1277 						  key->prefixlen);
1278 #endif
1279 		} else {
1280 			match = false;
1281 		}
1282 
1283 		if (match && better_md5_match(best_match, key))
1284 			best_match = key;
1285 	}
1286 	return best_match;
1287 }
1288 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1289 
1290 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1291 						      const union tcp_md5_addr *addr,
1292 						      int family, u8 prefixlen,
1293 						      int l3index, u8 flags)
1294 {
1295 	const struct tcp_sock *tp = tcp_sk(sk);
1296 	struct tcp_md5sig_key *key;
1297 	unsigned int size = sizeof(struct in_addr);
1298 	const struct tcp_md5sig_info *md5sig;
1299 
1300 	/* caller either holds rcu_read_lock() or socket lock */
1301 	md5sig = rcu_dereference_check(tp->md5sig_info,
1302 				       lockdep_sock_is_held(sk));
1303 	if (!md5sig)
1304 		return NULL;
1305 #if IS_ENABLED(CONFIG_IPV6)
1306 	if (family == AF_INET6)
1307 		size = sizeof(struct in6_addr);
1308 #endif
1309 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1310 				 lockdep_sock_is_held(sk)) {
1311 		if (key->family != family)
1312 			continue;
1313 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1314 			continue;
1315 		if (key->l3index != l3index)
1316 			continue;
1317 		if (!memcmp(&key->addr, addr, size) &&
1318 		    key->prefixlen == prefixlen)
1319 			return key;
1320 	}
1321 	return NULL;
1322 }
1323 
1324 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1325 					 const struct sock *addr_sk)
1326 {
1327 	const union tcp_md5_addr *addr;
1328 	int l3index;
1329 
1330 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1331 						 addr_sk->sk_bound_dev_if);
1332 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1333 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1334 }
1335 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1336 
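/* Allocate and publish the per-socket MD5 key list (tp->md5sig_info).  This
 * also disables GSO on the socket, since offloaded segmentation could not
 * recompute the per-segment MD5 signature.
 */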
1337 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1338 {
1339 	struct tcp_sock *tp = tcp_sk(sk);
1340 	struct tcp_md5sig_info *md5sig;
1341 
1342 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1343 	if (!md5sig)
1344 		return -ENOMEM;
1345 
1346 	sk_gso_disable(sk);
1347 	INIT_HLIST_HEAD(&md5sig->head);
1348 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1349 	return 0;
1350 }
1351 
1352 /* This can be called on a newly created socket, from other files */
1353 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1354 			    int family, u8 prefixlen, int l3index, u8 flags,
1355 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1356 {
1357 	/* Add Key to the list */
1358 	struct tcp_md5sig_key *key;
1359 	struct tcp_sock *tp = tcp_sk(sk);
1360 	struct tcp_md5sig_info *md5sig;
1361 
1362 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1363 	if (key) {
1364 		/* Pre-existing entry - just update that one.
1365 		 * Note that the key might be used concurrently.
1366 		 * data_race() tells KCSAN that we do not care about
1367 		 * key mismatches, since changing the MD5 key on live flows
1368 		 * can lead to packet drops.
1369 		 */
1370 		data_race(memcpy(key->key, newkey, newkeylen));
1371 
1372 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1373 		 * Also note that a reader could catch new key->keylen value
1374 		 * but old key->key[], this is the reason we use __GFP_ZERO
1375 		 * at sock_kmalloc() time below these lines.
1376 		 */
1377 		WRITE_ONCE(key->keylen, newkeylen);
1378 
1379 		return 0;
1380 	}
1381 
1382 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1383 					   lockdep_sock_is_held(sk));
1384 
1385 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1386 	if (!key)
1387 		return -ENOMEM;
1388 
1389 	memcpy(key->key, newkey, newkeylen);
1390 	key->keylen = newkeylen;
1391 	key->family = family;
1392 	key->prefixlen = prefixlen;
1393 	key->l3index = l3index;
1394 	key->flags = flags;
1395 	memcpy(&key->addr, addr,
1396 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1397 								 sizeof(struct in_addr));
1398 	hlist_add_head_rcu(&key->node, &md5sig->head);
1399 	return 0;
1400 }
1401 
1402 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1403 		   int family, u8 prefixlen, int l3index, u8 flags,
1404 		   const u8 *newkey, u8 newkeylen)
1405 {
1406 	struct tcp_sock *tp = tcp_sk(sk);
1407 
1408 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1409 		if (tcp_md5_alloc_sigpool())
1410 			return -ENOMEM;
1411 
1412 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1413 			tcp_md5_release_sigpool();
1414 			return -ENOMEM;
1415 		}
1416 
1417 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1418 			struct tcp_md5sig_info *md5sig;
1419 
1420 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1421 			rcu_assign_pointer(tp->md5sig_info, NULL);
1422 			kfree_rcu(md5sig, rcu);
1423 			tcp_md5_release_sigpool();
1424 			return -EUSERS;
1425 		}
1426 	}
1427 
1428 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1429 				newkey, newkeylen, GFP_KERNEL);
1430 }
1431 EXPORT_SYMBOL(tcp_md5_do_add);
1432 
1433 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1434 		     int family, u8 prefixlen, int l3index,
1435 		     struct tcp_md5sig_key *key)
1436 {
1437 	struct tcp_sock *tp = tcp_sk(sk);
1438 
1439 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1440 		tcp_md5_add_sigpool();
1441 
1442 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1443 			tcp_md5_release_sigpool();
1444 			return -ENOMEM;
1445 		}
1446 
1447 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1448 			struct tcp_md5sig_info *md5sig;
1449 
1450 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1451 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1452 			rcu_assign_pointer(tp->md5sig_info, NULL);
1453 			kfree_rcu(md5sig, rcu);
1454 			tcp_md5_release_sigpool();
1455 			return -EUSERS;
1456 		}
1457 	}
1458 
1459 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1460 				key->flags, key->key, key->keylen,
1461 				sk_gfp_mask(sk, GFP_ATOMIC));
1462 }
1463 EXPORT_SYMBOL(tcp_md5_key_copy);
1464 
1465 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1466 		   u8 prefixlen, int l3index, u8 flags)
1467 {
1468 	struct tcp_md5sig_key *key;
1469 
1470 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1471 	if (!key)
1472 		return -ENOENT;
1473 	hlist_del_rcu(&key->node);
1474 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1475 	kfree_rcu(key, rcu);
1476 	return 0;
1477 }
1478 EXPORT_SYMBOL(tcp_md5_do_del);
1479 
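/* Release every MD5 key configured on the socket. */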
1480 void tcp_clear_md5_list(struct sock *sk)
1481 {
1482 	struct tcp_sock *tp = tcp_sk(sk);
1483 	struct tcp_md5sig_key *key;
1484 	struct hlist_node *n;
1485 	struct tcp_md5sig_info *md5sig;
1486 
1487 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1488 
1489 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1490 		hlist_del_rcu(&key->node);
1491 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1492 		kfree_rcu(key, rcu);
1493 	}
1494 }
1495 
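/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: validate the request,
 * then add the key, or delete it when tcpm_keylen is zero.
 */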
1496 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1497 				 sockptr_t optval, int optlen)
1498 {
1499 	struct tcp_md5sig cmd;
1500 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1501 	const union tcp_md5_addr *addr;
1502 	u8 prefixlen = 32;
1503 	int l3index = 0;
1504 	bool l3flag;
1505 	u8 flags;
1506 
1507 	if (optlen < sizeof(cmd))
1508 		return -EINVAL;
1509 
1510 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1511 		return -EFAULT;
1512 
1513 	if (sin->sin_family != AF_INET)
1514 		return -EINVAL;
1515 
1516 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1517 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1518 
1519 	if (optname == TCP_MD5SIG_EXT &&
1520 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1521 		prefixlen = cmd.tcpm_prefixlen;
1522 		if (prefixlen > 32)
1523 			return -EINVAL;
1524 	}
1525 
1526 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1527 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1528 		struct net_device *dev;
1529 
1530 		rcu_read_lock();
1531 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1532 		if (dev && netif_is_l3_master(dev))
1533 			l3index = dev->ifindex;
1534 
1535 		rcu_read_unlock();
1536 
1537 		/* ok to reference set/not set outside of rcu;
1538 		 * right now device MUST be an L3 master
1539 		 */
1540 		if (!dev || !l3index)
1541 			return -EINVAL;
1542 	}
1543 
1544 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1545 
1546 	if (!cmd.tcpm_keylen)
1547 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1548 
1549 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1550 		return -EINVAL;
1551 
1552 	/* Don't allow keys for peers that have a matching TCP-AO key.
1553 	 * See the comment in tcp_ao_add_cmd()
1554 	 */
1555 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1556 		return -EKEYREJECTED;
1557 
1558 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1559 			      cmd.tcpm_key, cmd.tcpm_keylen);
1560 }
1561 
1562 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1563 				   __be32 daddr, __be32 saddr,
1564 				   const struct tcphdr *th, int nbytes)
1565 {
1566 	struct tcp4_pseudohdr *bp;
1567 	struct scatterlist sg;
1568 	struct tcphdr *_th;
1569 
1570 	bp = hp->scratch;
1571 	bp->saddr = saddr;
1572 	bp->daddr = daddr;
1573 	bp->pad = 0;
1574 	bp->protocol = IPPROTO_TCP;
1575 	bp->len = cpu_to_be16(nbytes);
1576 
1577 	_th = (struct tcphdr *)(bp + 1);
1578 	memcpy(_th, th, sizeof(*th));
1579 	_th->check = 0;
1580 
1581 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1582 	ahash_request_set_crypt(hp->req, &sg, NULL,
1583 				sizeof(*bp) + sizeof(*th));
1584 	return crypto_ahash_update(hp->req);
1585 }
1586 
1587 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1588 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1589 {
1590 	struct tcp_sigpool hp;
1591 
1592 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1593 		goto clear_hash_nostart;
1594 
1595 	if (crypto_ahash_init(hp.req))
1596 		goto clear_hash;
1597 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1598 		goto clear_hash;
1599 	if (tcp_md5_hash_key(&hp, key))
1600 		goto clear_hash;
1601 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1602 	if (crypto_ahash_final(hp.req))
1603 		goto clear_hash;
1604 
1605 	tcp_sigpool_end(&hp);
1606 	return 0;
1607 
1608 clear_hash:
1609 	tcp_sigpool_end(&hp);
1610 clear_hash_nostart:
1611 	memset(md5_hash, 0, 16);
1612 	return 1;
1613 }
1614 
1615 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1616 			const struct sock *sk,
1617 			const struct sk_buff *skb)
1618 {
1619 	const struct tcphdr *th = tcp_hdr(skb);
1620 	struct tcp_sigpool hp;
1621 	__be32 saddr, daddr;
1622 
1623 	if (sk) { /* valid for establish/request sockets */
1624 		saddr = sk->sk_rcv_saddr;
1625 		daddr = sk->sk_daddr;
1626 	} else {
1627 		const struct iphdr *iph = ip_hdr(skb);
1628 		saddr = iph->saddr;
1629 		daddr = iph->daddr;
1630 	}
1631 
1632 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1633 		goto clear_hash_nostart;
1634 
1635 	if (crypto_ahash_init(hp.req))
1636 		goto clear_hash;
1637 
1638 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1639 		goto clear_hash;
1640 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1641 		goto clear_hash;
1642 	if (tcp_md5_hash_key(&hp, key))
1643 		goto clear_hash;
1644 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1645 	if (crypto_ahash_final(hp.req))
1646 		goto clear_hash;
1647 
1648 	tcp_sigpool_end(&hp);
1649 	return 0;
1650 
1651 clear_hash:
1652 	tcp_sigpool_end(&hp);
1653 clear_hash_nostart:
1654 	memset(md5_hash, 0, 16);
1655 	return 1;
1656 }
1657 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1658 
1659 #endif
1660 
1661 static void tcp_v4_init_req(struct request_sock *req,
1662 			    const struct sock *sk_listener,
1663 			    struct sk_buff *skb)
1664 {
1665 	struct inet_request_sock *ireq = inet_rsk(req);
1666 	struct net *net = sock_net(sk_listener);
1667 
1668 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1669 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1670 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1671 }
1672 
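/* Initialise the request sock from the incoming SYN, run the LSM hook and
 * look up the route that will be used to answer it.
 */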
1673 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1674 					  struct sk_buff *skb,
1675 					  struct flowi *fl,
1676 					  struct request_sock *req)
1677 {
1678 	tcp_v4_init_req(req, sk, skb);
1679 
1680 	if (security_inet_conn_request(sk, skb, req))
1681 		return NULL;
1682 
1683 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1684 }
1685 
1686 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1687 	.family		=	PF_INET,
1688 	.obj_size	=	sizeof(struct tcp_request_sock),
1689 	.rtx_syn_ack	=	tcp_rtx_synack,
1690 	.send_ack	=	tcp_v4_reqsk_send_ack,
1691 	.destructor	=	tcp_v4_reqsk_destructor,
1692 	.send_reset	=	tcp_v4_send_reset,
1693 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1694 };
1695 
1696 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1697 	.mss_clamp	=	TCP_MSS_DEFAULT,
1698 #ifdef CONFIG_TCP_MD5SIG
1699 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1700 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1701 #endif
1702 #ifdef CONFIG_TCP_AO
1703 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1704 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1705 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1706 #endif
1707 #ifdef CONFIG_SYN_COOKIES
1708 	.cookie_init_seq =	cookie_v4_init_sequence,
1709 #endif
1710 	.route_req	=	tcp_v4_route_req,
1711 	.init_seq	=	tcp_v4_init_seq,
1712 	.init_ts_off	=	tcp_v4_init_ts_off,
1713 	.send_synack	=	tcp_v4_send_synack,
1714 };
1715 
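/* Handle an incoming connection request (SYN) on an IPv4 listener. */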
1716 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1717 {
1718 	/* Never answer SYNs sent to broadcast or multicast */
1719 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1720 		goto drop;
1721 
1722 	return tcp_conn_request(&tcp_request_sock_ops,
1723 				&tcp_request_sock_ipv4_ops, sk, skb);
1724 
1725 drop:
1726 	tcp_listendrop(sk);
1727 	return 0;
1728 }
1729 EXPORT_SYMBOL(tcp_v4_conn_request);
1730 
1731 
1732 /*
1733  * The three way handshake has completed - we got a valid synack -
1734  * now create the new socket.
1735  */
1736 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1737 				  struct request_sock *req,
1738 				  struct dst_entry *dst,
1739 				  struct request_sock *req_unhash,
1740 				  bool *own_req)
1741 {
1742 	struct inet_request_sock *ireq;
1743 	bool found_dup_sk = false;
1744 	struct inet_sock *newinet;
1745 	struct tcp_sock *newtp;
1746 	struct sock *newsk;
1747 #ifdef CONFIG_TCP_MD5SIG
1748 	const union tcp_md5_addr *addr;
1749 	struct tcp_md5sig_key *key;
1750 	int l3index;
1751 #endif
1752 	struct ip_options_rcu *inet_opt;
1753 
1754 	if (sk_acceptq_is_full(sk))
1755 		goto exit_overflow;
1756 
1757 	newsk = tcp_create_openreq_child(sk, req, skb);
1758 	if (!newsk)
1759 		goto exit_nonewsk;
1760 
1761 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1762 	inet_sk_rx_dst_set(newsk, skb);
1763 
1764 	newtp		      = tcp_sk(newsk);
1765 	newinet		      = inet_sk(newsk);
1766 	ireq		      = inet_rsk(req);
1767 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1768 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1769 	newsk->sk_bound_dev_if = ireq->ir_iif;
1770 	newinet->inet_saddr   = ireq->ir_loc_addr;
1771 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1772 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1773 	newinet->mc_index     = inet_iif(skb);
1774 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1775 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1776 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1777 	if (inet_opt)
1778 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1779 	atomic_set(&newinet->inet_id, get_random_u16());
1780 
1781 	/* Set ToS of the new socket based upon the value of the incoming SYN.
1782 	 * ECT bits are set later in tcp_init_transfer().
1783 	 */
1784 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1785 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1786 
1787 	if (!dst) {
1788 		dst = inet_csk_route_child_sock(sk, newsk, req);
1789 		if (!dst)
1790 			goto put_and_exit;
1791 	} else {
1792 		/* syncookie case : see end of cookie_v4_check() */
1793 	}
1794 	sk_setup_caps(newsk, dst);
1795 
1796 	tcp_ca_openreq_child(newsk, dst);
1797 
1798 	tcp_sync_mss(newsk, dst_mtu(dst));
1799 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1800 
1801 	tcp_initialize_rcv_mss(newsk);
1802 
1803 #ifdef CONFIG_TCP_MD5SIG
1804 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1805 	/* Copy over the MD5 key from the original socket */
1806 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1807 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1808 	if (key && !tcp_rsk_used_ao(req)) {
1809 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1810 			goto put_and_exit;
1811 		sk_gso_disable(newsk);
1812 	}
1813 #endif
1814 #ifdef CONFIG_TCP_AO
1815 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1816 		goto put_and_exit; /* OOM, release back memory */
1817 #endif
1818 
1819 	if (__inet_inherit_port(sk, newsk) < 0)
1820 		goto put_and_exit;
1821 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1822 				       &found_dup_sk);
1823 	if (likely(*own_req)) {
1824 		tcp_move_syn(newtp, req);
1825 		ireq->ireq_opt = NULL;
1826 	} else {
1827 		newinet->inet_opt = NULL;
1828 
1829 		if (!req_unhash && found_dup_sk) {
1830 			/* This code path should be executed only in the
1831 			 * syncookie case
1832 			 */
1833 			bh_unlock_sock(newsk);
1834 			sock_put(newsk);
1835 			newsk = NULL;
1836 		}
1837 	}
1838 	return newsk;
1839 
1840 exit_overflow:
1841 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1842 exit_nonewsk:
1843 	dst_release(dst);
1844 exit:
1845 	tcp_listendrop(sk);
1846 	return NULL;
1847 put_and_exit:
1848 	newinet->inet_opt = NULL;
1849 	inet_csk_prepare_forced_close(newsk);
1850 	tcp_done(newsk);
1851 	goto exit;
1852 }
1853 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1854 
1855 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1856 {
1857 #ifdef CONFIG_SYN_COOKIES
1858 	const struct tcphdr *th = tcp_hdr(skb);
1859 
1860 	if (!th->syn)
1861 		sk = cookie_v4_check(sk, skb);
1862 #endif
1863 	return sk;
1864 }
1865 
1866 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1867 			 struct tcphdr *th, u32 *cookie)
1868 {
1869 	u16 mss = 0;
1870 #ifdef CONFIG_SYN_COOKIES
1871 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1872 				    &tcp_request_sock_ipv4_ops, sk, th);
1873 	if (mss) {
1874 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1875 		tcp_synq_overflow(sk);
1876 	}
1877 #endif
1878 	return mss;
1879 }
1880 
1881 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1882 							   u32));
1883 /* The socket must have its spinlock held when we get
1884  * here, unless it is a TCP_LISTEN socket.
1885  *
1886  * We have a potential double-lock case here, so even when
1887  * doing backlog processing we use the BH locking scheme.
1888  * This is because we cannot sleep with the original spinlock
1889  * held.
1890  */
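/* A minimal illustration of that contract, mirroring what tcp_v4_rcv()
 * does further down (sketch only, error handling elided):
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = tcp_v4_do_rcv(sk, skb);
 *	else if (tcp_add_backlog(sk, skb, &drop_reason))
 *		return;		(socket already unlocked by tcp_add_backlog)
 *	bh_unlock_sock(sk);
 *
 * TCP_LISTEN sockets are the exception: tcp_v4_rcv() calls tcp_v4_do_rcv()
 * on them without taking the socket spinlock.
 */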
1891 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1892 {
1893 	enum skb_drop_reason reason;
1894 	struct sock *rsk;
1895 
1896 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1897 		struct dst_entry *dst;
1898 
1899 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1900 						lockdep_sock_is_held(sk));
1901 
1902 		sock_rps_save_rxhash(sk, skb);
1903 		sk_mark_napi_id(sk, skb);
1904 		if (dst) {
1905 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1906 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1907 					     dst, 0)) {
1908 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1909 				dst_release(dst);
1910 			}
1911 		}
1912 		tcp_rcv_established(sk, skb);
1913 		return 0;
1914 	}
1915 
1916 	if (tcp_checksum_complete(skb))
1917 		goto csum_err;
1918 
1919 	if (sk->sk_state == TCP_LISTEN) {
1920 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1921 
1922 		if (!nsk)
1923 			return 0;
1924 		if (nsk != sk) {
1925 			reason = tcp_child_process(sk, nsk, skb);
1926 			if (reason) {
1927 				rsk = nsk;
1928 				goto reset;
1929 			}
1930 			return 0;
1931 		}
1932 	} else
1933 		sock_rps_save_rxhash(sk, skb);
1934 
1935 	reason = tcp_rcv_state_process(sk, skb);
1936 	if (reason) {
1937 		rsk = sk;
1938 		goto reset;
1939 	}
1940 	return 0;
1941 
1942 reset:
1943 	tcp_v4_send_reset(rsk, skb);
1944 discard:
1945 	kfree_skb_reason(skb, reason);
1946 	/* Be careful here. If this function gets more complicated and
1947 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1948 	 * might be destroyed here. This current version compiles correctly,
1949 	 * but you have been warned.
1950 	 */
1951 	return 0;
1952 
1953 csum_err:
1954 	reason = SKB_DROP_REASON_TCP_CSUM;
1955 	trace_tcp_bad_csum(skb);
1956 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1957 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1958 	goto discard;
1959 }
1960 EXPORT_SYMBOL(tcp_v4_do_rcv);
1961 
1962 int tcp_v4_early_demux(struct sk_buff *skb)
1963 {
1964 	struct net *net = dev_net(skb->dev);
1965 	const struct iphdr *iph;
1966 	const struct tcphdr *th;
1967 	struct sock *sk;
1968 
1969 	if (skb->pkt_type != PACKET_HOST)
1970 		return 0;
1971 
1972 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1973 		return 0;
1974 
1975 	iph = ip_hdr(skb);
1976 	th = tcp_hdr(skb);
1977 
1978 	if (th->doff < sizeof(struct tcphdr) / 4)
1979 		return 0;
1980 
1981 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1982 				       iph->saddr, th->source,
1983 				       iph->daddr, ntohs(th->dest),
1984 				       skb->skb_iif, inet_sdif(skb));
1985 	if (sk) {
1986 		skb->sk = sk;
1987 		skb->destructor = sock_edemux;
1988 		if (sk_fullsock(sk)) {
1989 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1990 
1991 			if (dst)
1992 				dst = dst_check(dst, 0);
1993 			if (dst &&
1994 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1995 				skb_dst_set_noref(skb, dst);
1996 		}
1997 	}
1998 	return 0;
1999 }
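/* Note on early demux (roughly speaking): the function above runs from the
 * IP receive path before the routing decision.  If it finds an established
 * socket it attaches it to the skb, so the lookup in tcp_v4_rcv() can be
 * skipped, and when the cached rx dst is still valid for the incoming
 * interface it is reused, letting IP skip a full route lookup.
 */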
2000 
2001 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2002 		     enum skb_drop_reason *reason)
2003 {
2004 	u32 limit, tail_gso_size, tail_gso_segs;
2005 	struct skb_shared_info *shinfo;
2006 	const struct tcphdr *th;
2007 	struct tcphdr *thtail;
2008 	struct sk_buff *tail;
2009 	unsigned int hdrlen;
2010 	bool fragstolen;
2011 	u32 gso_segs;
2012 	u32 gso_size;
2013 	int delta;
2014 
2015 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2016 	 * we can fix skb->truesize to its real value to avoid future drops.
2017 	 * This is valid because skb is not yet charged to the socket.
2018 	 * It has been noticed that pure SACK packets were sometimes dropped
2019 	 * (when cooked by drivers without the copybreak feature).
2020 	 */
2021 	skb_condense(skb);
2022 
2023 	skb_dst_drop(skb);
2024 
2025 	if (unlikely(tcp_checksum_complete(skb))) {
2026 		bh_unlock_sock(sk);
2027 		trace_tcp_bad_csum(skb);
2028 		*reason = SKB_DROP_REASON_TCP_CSUM;
2029 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2030 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2031 		return true;
2032 	}
2033 
2034 	/* Attempt coalescing to last skb in backlog, even if we are
2035 	 * above the limits.
2036 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2037 	 */
2038 	th = (const struct tcphdr *)skb->data;
2039 	hdrlen = th->doff * 4;
2040 
2041 	tail = sk->sk_backlog.tail;
2042 	if (!tail)
2043 		goto no_coalesce;
2044 	thtail = (struct tcphdr *)tail->data;
2045 
2046 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2047 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2048 	    ((TCP_SKB_CB(tail)->tcp_flags |
2049 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2050 	    !((TCP_SKB_CB(tail)->tcp_flags &
2051 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2052 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2053 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2054 #ifdef CONFIG_TLS_DEVICE
2055 	    tail->decrypted != skb->decrypted ||
2056 #endif
2057 	    !mptcp_skb_can_collapse(tail, skb) ||
2058 	    thtail->doff != th->doff ||
2059 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2060 		goto no_coalesce;
2061 
2062 	__skb_pull(skb, hdrlen);
2063 
2064 	shinfo = skb_shinfo(skb);
2065 	gso_size = shinfo->gso_size ?: skb->len;
2066 	gso_segs = shinfo->gso_segs ?: 1;
2067 
2068 	shinfo = skb_shinfo(tail);
2069 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2070 	tail_gso_segs = shinfo->gso_segs ?: 1;
2071 
2072 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2073 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2074 
2075 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2076 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2077 			thtail->window = th->window;
2078 		}
2079 
2080 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2081 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2082 		 * is not entered if we append a packet with a FIN.
2083 		 * SYN, RST, URG are not present.
2084 		 * ACK is set on both packets.
2085 		 * PSH : we do not really care in TCP stack,
2086 		 *       at least for 'GRO' packets.
2087 		 */
2088 		thtail->fin |= th->fin;
2089 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2090 
2091 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2092 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2093 			tail->tstamp = skb->tstamp;
2094 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2095 		}
2096 
2097 		/* Not as strict as GRO. We only need to carry the max mss value */
2098 		shinfo->gso_size = max(gso_size, tail_gso_size);
2099 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2100 
2101 		sk->sk_backlog.len += delta;
2102 		__NET_INC_STATS(sock_net(sk),
2103 				LINUX_MIB_TCPBACKLOGCOALESCE);
2104 		kfree_skb_partial(skb, fragstolen);
2105 		return false;
2106 	}
2107 	__skb_push(skb, hdrlen);
2108 
2109 no_coalesce:
2110 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
2111 
2112 	/* Only the socket owner can try to collapse/prune rx queues
2113 	 * to reduce memory overhead, so add a little headroom here.
2114 	 * Only a few socket backlogs are likely to be non-empty concurrently.
2115 	 */
2116 	limit += 64 * 1024;
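	/* Worked example with illustrative numbers: sk_rcvbuf == 1 MB and
	 * sk_sndbuf == 512 KB give limit == 1 MB + 256 KB + 64 KB == 1344 KB.
	 */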
2117 
2118 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2119 		bh_unlock_sock(sk);
2120 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2121 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2122 		return true;
2123 	}
2124 	return false;
2125 }
2126 EXPORT_SYMBOL(tcp_add_backlog);
2127 
2128 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2129 {
2130 	struct tcphdr *th = (struct tcphdr *)skb->data;
2131 
2132 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2133 }
2134 EXPORT_SYMBOL(tcp_filter);
2135 
2136 static void tcp_v4_restore_cb(struct sk_buff *skb)
2137 {
2138 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2139 		sizeof(struct inet_skb_parm));
2140 }
2141 
2142 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2143 			   const struct tcphdr *th)
2144 {
2145 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
2146 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2147 	 */
2148 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2149 		sizeof(struct inet_skb_parm));
2150 	barrier();
2151 
2152 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2153 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2154 				    skb->len - th->doff * 4);
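	/* Example: a pure SYN occupies one unit of sequence space, so its
	 * end_seq is seq + 1 despite carrying no payload; a data segment
	 * with FIN set ends at seq + payload_len + 1.
	 */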
2155 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2156 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2157 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
2158 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2159 	TCP_SKB_CB(skb)->sacked	 = 0;
2160 	TCP_SKB_CB(skb)->has_rxtstamp =
2161 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2162 }
2163 
2164 /*
2165  *	From tcp_input.c
2166  */
2167 
2168 int tcp_v4_rcv(struct sk_buff *skb)
2169 {
2170 	struct net *net = dev_net(skb->dev);
2171 	enum skb_drop_reason drop_reason;
2172 	int sdif = inet_sdif(skb);
2173 	int dif = inet_iif(skb);
2174 	const struct iphdr *iph;
2175 	const struct tcphdr *th;
2176 	bool refcounted;
2177 	struct sock *sk;
2178 	int ret;
2179 
2180 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2181 	if (skb->pkt_type != PACKET_HOST)
2182 		goto discard_it;
2183 
2184 	/* Count it even if it's bad */
2185 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2186 
2187 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2188 		goto discard_it;
2189 
2190 	th = (const struct tcphdr *)skb->data;
2191 
2192 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2193 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2194 		goto bad_packet;
2195 	}
2196 	if (!pskb_may_pull(skb, th->doff * 4))
2197 		goto discard_it;
2198 
2199 	/* An explanation is required here, I think.
2200 	 * Packet length and doff are validated by header prediction,
2201 	 * provided the case of th->doff == 0 is eliminated.
2202 	 * So, we defer the checks. */
2203 
2204 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2205 		goto csum_error;
2206 
2207 	th = (const struct tcphdr *)skb->data;
2208 	iph = ip_hdr(skb);
2209 lookup:
2210 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2211 			       skb, __tcp_hdrlen(th), th->source,
2212 			       th->dest, sdif, &refcounted);
2213 	if (!sk)
2214 		goto no_tcp_socket;
2215 
2216 process:
2217 	if (sk->sk_state == TCP_TIME_WAIT)
2218 		goto do_time_wait;
2219 
2220 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2221 		struct request_sock *req = inet_reqsk(sk);
2222 		bool req_stolen = false;
2223 		struct sock *nsk;
2224 
2225 		sk = req->rsk_listener;
2226 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2227 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2228 		else
2229 			drop_reason = tcp_inbound_hash(sk, req, skb,
2230 						       &iph->saddr, &iph->daddr,
2231 						       AF_INET, dif, sdif);
2232 		if (unlikely(drop_reason)) {
2233 			sk_drops_add(sk, skb);
2234 			reqsk_put(req);
2235 			goto discard_it;
2236 		}
2237 		if (tcp_checksum_complete(skb)) {
2238 			reqsk_put(req);
2239 			goto csum_error;
2240 		}
2241 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2242 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2243 			if (!nsk) {
2244 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2245 				goto lookup;
2246 			}
2247 			sk = nsk;
2248 			/* reuseport_migrate_sock() has already held one sk_refcnt
2249 			 * before returning.
2250 			 */
2251 		} else {
2252 			/* We own a reference on the listener, increase it again
2253 			 * as we might lose it too soon.
2254 			 */
2255 			sock_hold(sk);
2256 		}
2257 		refcounted = true;
2258 		nsk = NULL;
2259 		if (!tcp_filter(sk, skb)) {
2260 			th = (const struct tcphdr *)skb->data;
2261 			iph = ip_hdr(skb);
2262 			tcp_v4_fill_cb(skb, iph, th);
2263 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2264 		} else {
2265 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2266 		}
2267 		if (!nsk) {
2268 			reqsk_put(req);
2269 			if (req_stolen) {
2270 				/* Another cpu got exclusive access to req
2271 			 * and created a full-blown socket.
2272 				 * Try to feed this packet to this socket
2273 				 * instead of discarding it.
2274 				 */
2275 				tcp_v4_restore_cb(skb);
2276 				sock_put(sk);
2277 				goto lookup;
2278 			}
2279 			goto discard_and_relse;
2280 		}
2281 		nf_reset_ct(skb);
2282 		if (nsk == sk) {
2283 			reqsk_put(req);
2284 			tcp_v4_restore_cb(skb);
2285 		} else {
2286 			drop_reason = tcp_child_process(sk, nsk, skb);
2287 			if (drop_reason) {
2288 				tcp_v4_send_reset(nsk, skb);
2289 				goto discard_and_relse;
2290 			}
2291 			sock_put(sk);
2292 			return 0;
2293 		}
2294 	}
2295 
2296 	if (static_branch_unlikely(&ip4_min_ttl)) {
2297 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2298 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2299 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2300 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2301 			goto discard_and_relse;
2302 		}
2303 	}
2304 
2305 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2306 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2307 		goto discard_and_relse;
2308 	}
2309 
2310 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2311 				       AF_INET, dif, sdif);
2312 	if (drop_reason)
2313 		goto discard_and_relse;
2314 
2315 	nf_reset_ct(skb);
2316 
2317 	if (tcp_filter(sk, skb)) {
2318 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2319 		goto discard_and_relse;
2320 	}
2321 	th = (const struct tcphdr *)skb->data;
2322 	iph = ip_hdr(skb);
2323 	tcp_v4_fill_cb(skb, iph, th);
2324 
2325 	skb->dev = NULL;
2326 
2327 	if (sk->sk_state == TCP_LISTEN) {
2328 		ret = tcp_v4_do_rcv(sk, skb);
2329 		goto put_and_return;
2330 	}
2331 
2332 	sk_incoming_cpu_update(sk);
2333 
2334 	bh_lock_sock_nested(sk);
2335 	tcp_segs_in(tcp_sk(sk), skb);
2336 	ret = 0;
2337 	if (!sock_owned_by_user(sk)) {
2338 		ret = tcp_v4_do_rcv(sk, skb);
2339 	} else {
2340 		if (tcp_add_backlog(sk, skb, &drop_reason))
2341 			goto discard_and_relse;
2342 	}
2343 	bh_unlock_sock(sk);
2344 
2345 put_and_return:
2346 	if (refcounted)
2347 		sock_put(sk);
2348 
2349 	return ret;
2350 
2351 no_tcp_socket:
2352 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2353 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2354 		goto discard_it;
2355 
2356 	tcp_v4_fill_cb(skb, iph, th);
2357 
2358 	if (tcp_checksum_complete(skb)) {
2359 csum_error:
2360 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2361 		trace_tcp_bad_csum(skb);
2362 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2363 bad_packet:
2364 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2365 	} else {
2366 		tcp_v4_send_reset(NULL, skb);
2367 	}
2368 
2369 discard_it:
2370 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2371 	/* Discard frame. */
2372 	kfree_skb_reason(skb, drop_reason);
2373 	return 0;
2374 
2375 discard_and_relse:
2376 	sk_drops_add(sk, skb);
2377 	if (refcounted)
2378 		sock_put(sk);
2379 	goto discard_it;
2380 
2381 do_time_wait:
2382 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2383 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2384 		inet_twsk_put(inet_twsk(sk));
2385 		goto discard_it;
2386 	}
2387 
2388 	tcp_v4_fill_cb(skb, iph, th);
2389 
2390 	if (tcp_checksum_complete(skb)) {
2391 		inet_twsk_put(inet_twsk(sk));
2392 		goto csum_error;
2393 	}
2394 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2395 	case TCP_TW_SYN: {
2396 		struct sock *sk2 = inet_lookup_listener(net,
2397 							net->ipv4.tcp_death_row.hashinfo,
2398 							skb, __tcp_hdrlen(th),
2399 							iph->saddr, th->source,
2400 							iph->daddr, th->dest,
2401 							inet_iif(skb),
2402 							sdif);
2403 		if (sk2) {
2404 			inet_twsk_deschedule_put(inet_twsk(sk));
2405 			sk = sk2;
2406 			tcp_v4_restore_cb(skb);
2407 			refcounted = false;
2408 			goto process;
2409 		}
2410 	}
2411 		/* to ACK */
2412 		fallthrough;
2413 	case TCP_TW_ACK:
2414 		tcp_v4_timewait_ack(sk, skb);
2415 		break;
2416 	case TCP_TW_RST:
2417 		tcp_v4_send_reset(sk, skb);
2418 		inet_twsk_deschedule_put(inet_twsk(sk));
2419 		goto discard_it;
2420 	case TCP_TW_SUCCESS:;
2421 	}
2422 	goto discard_it;
2423 }
2424 
2425 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2426 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2427 	.twsk_unique	= tcp_twsk_unique,
2428 	.twsk_destructor= tcp_twsk_destructor,
2429 };
2430 
2431 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2432 {
2433 	struct dst_entry *dst = skb_dst(skb);
2434 
2435 	if (dst && dst_hold_safe(dst)) {
2436 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2437 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2438 	}
2439 }
2440 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2441 
2442 const struct inet_connection_sock_af_ops ipv4_specific = {
2443 	.queue_xmit	   = ip_queue_xmit,
2444 	.send_check	   = tcp_v4_send_check,
2445 	.rebuild_header	   = inet_sk_rebuild_header,
2446 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2447 	.conn_request	   = tcp_v4_conn_request,
2448 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2449 	.net_header_len	   = sizeof(struct iphdr),
2450 	.setsockopt	   = ip_setsockopt,
2451 	.getsockopt	   = ip_getsockopt,
2452 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2453 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2454 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2455 };
2456 EXPORT_SYMBOL(ipv4_specific);
2457 
2458 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2459 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2460 #ifdef CONFIG_TCP_MD5SIG
2461 	.md5_lookup		= tcp_v4_md5_lookup,
2462 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2463 	.md5_parse		= tcp_v4_parse_md5_keys,
2464 #endif
2465 #ifdef CONFIG_TCP_AO
2466 	.ao_lookup		= tcp_v4_ao_lookup,
2467 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2468 	.ao_parse		= tcp_v4_parse_ao,
2469 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2470 #endif
2471 };
2472 #endif
2473 
2474 /* NOTE: A lot of things are set to zero explicitly by the call to
2475  *       sk_alloc(), so they need not be done here.
2476  */
2477 static int tcp_v4_init_sock(struct sock *sk)
2478 {
2479 	struct inet_connection_sock *icsk = inet_csk(sk);
2480 
2481 	tcp_init_sock(sk);
2482 
2483 	icsk->icsk_af_ops = &ipv4_specific;
2484 
2485 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2486 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2487 #endif
2488 
2489 	return 0;
2490 }
2491 
2492 #ifdef CONFIG_TCP_MD5SIG
2493 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2494 {
2495 	struct tcp_md5sig_info *md5sig;
2496 
2497 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2498 	kfree(md5sig);
2499 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2500 	tcp_md5_release_sigpool();
2501 }
2502 #endif
2503 
2504 void tcp_v4_destroy_sock(struct sock *sk)
2505 {
2506 	struct tcp_sock *tp = tcp_sk(sk);
2507 
2508 	trace_tcp_destroy_sock(sk);
2509 
2510 	tcp_clear_xmit_timers(sk);
2511 
2512 	tcp_cleanup_congestion_control(sk);
2513 
2514 	tcp_cleanup_ulp(sk);
2515 
2516 	/* Clean up the write buffer. */
2517 	tcp_write_queue_purge(sk);
2518 
2519 	/* Check if we want to disable active TFO */
2520 	tcp_fastopen_active_disable_ofo_check(sk);
2521 
2522 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2523 	skb_rbtree_purge(&tp->out_of_order_queue);
2524 
2525 #ifdef CONFIG_TCP_MD5SIG
2526 	/* Clean up the MD5 key list, if any */
2527 	if (tp->md5sig_info) {
2528 		struct tcp_md5sig_info *md5sig;
2529 
2530 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2531 		tcp_clear_md5_list(sk);
2532 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2533 		rcu_assign_pointer(tp->md5sig_info, NULL);
2534 	}
2535 #endif
2536 	tcp_ao_destroy_sock(sk, false);
2537 
2538 	/* Clean up a referenced TCP bind bucket. */
2539 	if (inet_csk(sk)->icsk_bind_hash)
2540 		inet_put_port(sk);
2541 
2542 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2543 
2544 	/* If socket is aborted during connect operation */
2545 	tcp_free_fastopen_req(tp);
2546 	tcp_fastopen_destroy_cipher(sk);
2547 	tcp_saved_syn_free(tp);
2548 
2549 	sk_sockets_allocated_dec(sk);
2550 }
2551 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2552 
2553 #ifdef CONFIG_PROC_FS
2554 /* Proc filesystem TCP sock list dumping. */
2555 
2556 static unsigned short seq_file_family(const struct seq_file *seq);
2557 
2558 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2559 {
2560 	unsigned short family = seq_file_family(seq);
2561 
2562 	/* AF_UNSPEC is used as a match all */
2563 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2564 		net_eq(sock_net(sk), seq_file_net(seq)));
2565 }
2566 
2567 /* Find a non-empty bucket (starting from st->bucket)
2568  * and return the first sk from it.
2569  */
2570 static void *listening_get_first(struct seq_file *seq)
2571 {
2572 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2573 	struct tcp_iter_state *st = seq->private;
2574 
2575 	st->offset = 0;
2576 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2577 		struct inet_listen_hashbucket *ilb2;
2578 		struct hlist_nulls_node *node;
2579 		struct sock *sk;
2580 
2581 		ilb2 = &hinfo->lhash2[st->bucket];
2582 		if (hlist_nulls_empty(&ilb2->nulls_head))
2583 			continue;
2584 
2585 		spin_lock(&ilb2->lock);
2586 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2587 			if (seq_sk_match(seq, sk))
2588 				return sk;
2589 		}
2590 		spin_unlock(&ilb2->lock);
2591 	}
2592 
2593 	return NULL;
2594 }
2595 
2596 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2597  * If "cur" is the last one in st->bucket,
2598  * call listening_get_first() to return the first sk of the next
2599  * non-empty bucket.
2600  */
2601 static void *listening_get_next(struct seq_file *seq, void *cur)
2602 {
2603 	struct tcp_iter_state *st = seq->private;
2604 	struct inet_listen_hashbucket *ilb2;
2605 	struct hlist_nulls_node *node;
2606 	struct inet_hashinfo *hinfo;
2607 	struct sock *sk = cur;
2608 
2609 	++st->num;
2610 	++st->offset;
2611 
2612 	sk = sk_nulls_next(sk);
2613 	sk_nulls_for_each_from(sk, node) {
2614 		if (seq_sk_match(seq, sk))
2615 			return sk;
2616 	}
2617 
2618 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2619 	ilb2 = &hinfo->lhash2[st->bucket];
2620 	spin_unlock(&ilb2->lock);
2621 	++st->bucket;
2622 	return listening_get_first(seq);
2623 }
2624 
2625 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2626 {
2627 	struct tcp_iter_state *st = seq->private;
2628 	void *rc;
2629 
2630 	st->bucket = 0;
2631 	st->offset = 0;
2632 	rc = listening_get_first(seq);
2633 
2634 	while (rc && *pos) {
2635 		rc = listening_get_next(seq, rc);
2636 		--*pos;
2637 	}
2638 	return rc;
2639 }
2640 
2641 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2642 				const struct tcp_iter_state *st)
2643 {
2644 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2645 }
2646 
2647 /*
2648  * Get first established socket starting from bucket given in st->bucket.
2649  * If st->bucket is zero, the very first socket in the hash is returned.
2650  */
2651 static void *established_get_first(struct seq_file *seq)
2652 {
2653 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2654 	struct tcp_iter_state *st = seq->private;
2655 
2656 	st->offset = 0;
2657 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2658 		struct sock *sk;
2659 		struct hlist_nulls_node *node;
2660 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2661 
2662 		cond_resched();
2663 
2664 		/* Lockless fast path for the common case of empty buckets */
2665 		if (empty_bucket(hinfo, st))
2666 			continue;
2667 
2668 		spin_lock_bh(lock);
2669 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2670 			if (seq_sk_match(seq, sk))
2671 				return sk;
2672 		}
2673 		spin_unlock_bh(lock);
2674 	}
2675 
2676 	return NULL;
2677 }
2678 
2679 static void *established_get_next(struct seq_file *seq, void *cur)
2680 {
2681 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2682 	struct tcp_iter_state *st = seq->private;
2683 	struct hlist_nulls_node *node;
2684 	struct sock *sk = cur;
2685 
2686 	++st->num;
2687 	++st->offset;
2688 
2689 	sk = sk_nulls_next(sk);
2690 
2691 	sk_nulls_for_each_from(sk, node) {
2692 		if (seq_sk_match(seq, sk))
2693 			return sk;
2694 	}
2695 
2696 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2697 	++st->bucket;
2698 	return established_get_first(seq);
2699 }
2700 
2701 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2702 {
2703 	struct tcp_iter_state *st = seq->private;
2704 	void *rc;
2705 
2706 	st->bucket = 0;
2707 	rc = established_get_first(seq);
2708 
2709 	while (rc && pos) {
2710 		rc = established_get_next(seq, rc);
2711 		--pos;
2712 	}
2713 	return rc;
2714 }
2715 
2716 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2717 {
2718 	void *rc;
2719 	struct tcp_iter_state *st = seq->private;
2720 
2721 	st->state = TCP_SEQ_STATE_LISTENING;
2722 	rc	  = listening_get_idx(seq, &pos);
2723 
2724 	if (!rc) {
2725 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2726 		rc	  = established_get_idx(seq, pos);
2727 	}
2728 
2729 	return rc;
2730 }
2731 
2732 static void *tcp_seek_last_pos(struct seq_file *seq)
2733 {
2734 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2735 	struct tcp_iter_state *st = seq->private;
2736 	int bucket = st->bucket;
2737 	int offset = st->offset;
2738 	int orig_num = st->num;
2739 	void *rc = NULL;
2740 
2741 	switch (st->state) {
2742 	case TCP_SEQ_STATE_LISTENING:
2743 		if (st->bucket > hinfo->lhash2_mask)
2744 			break;
2745 		rc = listening_get_first(seq);
2746 		while (offset-- && rc && bucket == st->bucket)
2747 			rc = listening_get_next(seq, rc);
2748 		if (rc)
2749 			break;
2750 		st->bucket = 0;
2751 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2752 		fallthrough;
2753 	case TCP_SEQ_STATE_ESTABLISHED:
2754 		if (st->bucket > hinfo->ehash_mask)
2755 			break;
2756 		rc = established_get_first(seq);
2757 		while (offset-- && rc && bucket == st->bucket)
2758 			rc = established_get_next(seq, rc);
2759 	}
2760 
2761 	st->num = orig_num;
2762 
2763 	return rc;
2764 }
2765 
2766 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2767 {
2768 	struct tcp_iter_state *st = seq->private;
2769 	void *rc;
2770 
2771 	if (*pos && *pos == st->last_pos) {
2772 		rc = tcp_seek_last_pos(seq);
2773 		if (rc)
2774 			goto out;
2775 	}
2776 
2777 	st->state = TCP_SEQ_STATE_LISTENING;
2778 	st->num = 0;
2779 	st->bucket = 0;
2780 	st->offset = 0;
2781 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2782 
2783 out:
2784 	st->last_pos = *pos;
2785 	return rc;
2786 }
2787 EXPORT_SYMBOL(tcp_seq_start);
2788 
2789 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2790 {
2791 	struct tcp_iter_state *st = seq->private;
2792 	void *rc = NULL;
2793 
2794 	if (v == SEQ_START_TOKEN) {
2795 		rc = tcp_get_idx(seq, 0);
2796 		goto out;
2797 	}
2798 
2799 	switch (st->state) {
2800 	case TCP_SEQ_STATE_LISTENING:
2801 		rc = listening_get_next(seq, v);
2802 		if (!rc) {
2803 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2804 			st->bucket = 0;
2805 			st->offset = 0;
2806 			rc	  = established_get_first(seq);
2807 		}
2808 		break;
2809 	case TCP_SEQ_STATE_ESTABLISHED:
2810 		rc = established_get_next(seq, v);
2811 		break;
2812 	}
2813 out:
2814 	++*pos;
2815 	st->last_pos = *pos;
2816 	return rc;
2817 }
2818 EXPORT_SYMBOL(tcp_seq_next);
2819 
2820 void tcp_seq_stop(struct seq_file *seq, void *v)
2821 {
2822 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2823 	struct tcp_iter_state *st = seq->private;
2824 
2825 	switch (st->state) {
2826 	case TCP_SEQ_STATE_LISTENING:
2827 		if (v != SEQ_START_TOKEN)
2828 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2829 		break;
2830 	case TCP_SEQ_STATE_ESTABLISHED:
2831 		if (v)
2832 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2833 		break;
2834 	}
2835 }
2836 EXPORT_SYMBOL(tcp_seq_stop);
2837 
2838 static void get_openreq4(const struct request_sock *req,
2839 			 struct seq_file *f, int i)
2840 {
2841 	const struct inet_request_sock *ireq = inet_rsk(req);
2842 	long delta = req->rsk_timer.expires - jiffies;
2843 
2844 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2845 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2846 		i,
2847 		ireq->ir_loc_addr,
2848 		ireq->ir_num,
2849 		ireq->ir_rmt_addr,
2850 		ntohs(ireq->ir_rmt_port),
2851 		TCP_SYN_RECV,
2852 		0, 0, /* could print option size, but that is af dependent. */
2853 		1,    /* timers active (only the expire timer) */
2854 		jiffies_delta_to_clock_t(delta),
2855 		req->num_timeout,
2856 		from_kuid_munged(seq_user_ns(f),
2857 				 sock_i_uid(req->rsk_listener)),
2858 		0,  /* non standard timer */
2859 		0, /* open_requests have no inode */
2860 		0,
2861 		req);
2862 }
2863 
2864 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2865 {
2866 	int timer_active;
2867 	unsigned long timer_expires;
2868 	const struct tcp_sock *tp = tcp_sk(sk);
2869 	const struct inet_connection_sock *icsk = inet_csk(sk);
2870 	const struct inet_sock *inet = inet_sk(sk);
2871 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2872 	__be32 dest = inet->inet_daddr;
2873 	__be32 src = inet->inet_rcv_saddr;
2874 	__u16 destp = ntohs(inet->inet_dport);
2875 	__u16 srcp = ntohs(inet->inet_sport);
2876 	int rx_queue;
2877 	int state;
2878 
2879 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2880 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2881 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2882 		timer_active	= 1;
2883 		timer_expires	= icsk->icsk_timeout;
2884 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2885 		timer_active	= 4;
2886 		timer_expires	= icsk->icsk_timeout;
2887 	} else if (timer_pending(&sk->sk_timer)) {
2888 		timer_active	= 2;
2889 		timer_expires	= sk->sk_timer.expires;
2890 	} else {
2891 		timer_active	= 0;
2892 		timer_expires = jiffies;
2893 	}
2894 
2895 	state = inet_sk_state_load(sk);
2896 	if (state == TCP_LISTEN)
2897 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2898 	else
2899 		/* Because we don't lock the socket,
2900 		 * we might find a transient negative value.
2901 		 */
2902 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2903 				      READ_ONCE(tp->copied_seq), 0);
2904 
2905 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2906 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2907 		i, src, srcp, dest, destp, state,
2908 		READ_ONCE(tp->write_seq) - tp->snd_una,
2909 		rx_queue,
2910 		timer_active,
2911 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2912 		icsk->icsk_retransmits,
2913 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2914 		icsk->icsk_probes_out,
2915 		sock_i_ino(sk),
2916 		refcount_read(&sk->sk_refcnt), sk,
2917 		jiffies_to_clock_t(icsk->icsk_rto),
2918 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2919 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2920 		tcp_snd_cwnd(tp),
2921 		state == TCP_LISTEN ?
2922 		    fastopenq->max_qlen :
2923 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2924 }
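/* Note: the labelled columns match the header emitted by tcp4_seq_show()
 * (sl, local/remote address, st, tx_queue:rx_queue, tr:tm->when, retrnsmt,
 * uid, timeout, inode).  The unlabelled trailing fields are, in order:
 * socket refcount, socket pointer, rto, ato, (quick << 1) | pingpong,
 * snd_cwnd, and ssthresh (or fastopen max_qlen for listeners), mirroring
 * the seq_printf() arguments above.
 */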
2925 
2926 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2927 			       struct seq_file *f, int i)
2928 {
2929 	long delta = tw->tw_timer.expires - jiffies;
2930 	__be32 dest, src;
2931 	__u16 destp, srcp;
2932 
2933 	dest  = tw->tw_daddr;
2934 	src   = tw->tw_rcv_saddr;
2935 	destp = ntohs(tw->tw_dport);
2936 	srcp  = ntohs(tw->tw_sport);
2937 
2938 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2939 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2940 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2941 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2942 		refcount_read(&tw->tw_refcnt), tw);
2943 }
2944 
2945 #define TMPSZ 150
2946 
2947 static int tcp4_seq_show(struct seq_file *seq, void *v)
2948 {
2949 	struct tcp_iter_state *st;
2950 	struct sock *sk = v;
2951 
2952 	seq_setwidth(seq, TMPSZ - 1);
2953 	if (v == SEQ_START_TOKEN) {
2954 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2955 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2956 			   "inode");
2957 		goto out;
2958 	}
2959 	st = seq->private;
2960 
2961 	if (sk->sk_state == TCP_TIME_WAIT)
2962 		get_timewait4_sock(v, seq, st->num);
2963 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2964 		get_openreq4(v, seq, st->num);
2965 	else
2966 		get_tcp4_sock(v, seq, st->num);
2967 out:
2968 	seq_pad(seq, '\n');
2969 	return 0;
2970 }
2971 
2972 #ifdef CONFIG_BPF_SYSCALL
2973 struct bpf_tcp_iter_state {
2974 	struct tcp_iter_state state;
2975 	unsigned int cur_sk;
2976 	unsigned int end_sk;
2977 	unsigned int max_sk;
2978 	struct sock **batch;
2979 	bool st_bucket_done;
2980 };
2981 
2982 struct bpf_iter__tcp {
2983 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2984 	__bpf_md_ptr(struct sock_common *, sk_common);
2985 	uid_t uid __aligned(8);
2986 };
2987 
2988 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2989 			     struct sock_common *sk_common, uid_t uid)
2990 {
2991 	struct bpf_iter__tcp ctx;
2992 
2993 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2994 	ctx.meta = meta;
2995 	ctx.sk_common = sk_common;
2996 	ctx.uid = uid;
2997 	return bpf_iter_run_prog(prog, &ctx);
2998 }
2999 
3000 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3001 {
3002 	while (iter->cur_sk < iter->end_sk)
3003 		sock_gen_put(iter->batch[iter->cur_sk++]);
3004 }
3005 
3006 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3007 				      unsigned int new_batch_sz)
3008 {
3009 	struct sock **new_batch;
3010 
3011 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3012 			     GFP_USER | __GFP_NOWARN);
3013 	if (!new_batch)
3014 		return -ENOMEM;
3015 
3016 	bpf_iter_tcp_put_batch(iter);
3017 	kvfree(iter->batch);
3018 	iter->batch = new_batch;
3019 	iter->max_sk = new_batch_sz;
3020 
3021 	return 0;
3022 }
3023 
3024 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3025 						 struct sock *start_sk)
3026 {
3027 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3028 	struct bpf_tcp_iter_state *iter = seq->private;
3029 	struct tcp_iter_state *st = &iter->state;
3030 	struct hlist_nulls_node *node;
3031 	unsigned int expected = 1;
3032 	struct sock *sk;
3033 
3034 	sock_hold(start_sk);
3035 	iter->batch[iter->end_sk++] = start_sk;
3036 
3037 	sk = sk_nulls_next(start_sk);
3038 	sk_nulls_for_each_from(sk, node) {
3039 		if (seq_sk_match(seq, sk)) {
3040 			if (iter->end_sk < iter->max_sk) {
3041 				sock_hold(sk);
3042 				iter->batch[iter->end_sk++] = sk;
3043 			}
3044 			expected++;
3045 		}
3046 	}
3047 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3048 
3049 	return expected;
3050 }
3051 
3052 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3053 						   struct sock *start_sk)
3054 {
3055 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3056 	struct bpf_tcp_iter_state *iter = seq->private;
3057 	struct tcp_iter_state *st = &iter->state;
3058 	struct hlist_nulls_node *node;
3059 	unsigned int expected = 1;
3060 	struct sock *sk;
3061 
3062 	sock_hold(start_sk);
3063 	iter->batch[iter->end_sk++] = start_sk;
3064 
3065 	sk = sk_nulls_next(start_sk);
3066 	sk_nulls_for_each_from(sk, node) {
3067 		if (seq_sk_match(seq, sk)) {
3068 			if (iter->end_sk < iter->max_sk) {
3069 				sock_hold(sk);
3070 				iter->batch[iter->end_sk++] = sk;
3071 			}
3072 			expected++;
3073 		}
3074 	}
3075 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3076 
3077 	return expected;
3078 }
3079 
3080 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3081 {
3082 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3083 	struct bpf_tcp_iter_state *iter = seq->private;
3084 	struct tcp_iter_state *st = &iter->state;
3085 	unsigned int expected;
3086 	bool resized = false;
3087 	struct sock *sk;
3088 
3089 	/* The st->bucket is done.  Directly advance to the next
3090 	 * bucket instead of having tcp_seek_last_pos() skip entries
3091 	 * one by one in the current bucket, only to find out
3092 	 * it has to advance to the next bucket.
3093 	 */
3094 	if (iter->st_bucket_done) {
3095 		st->offset = 0;
3096 		st->bucket++;
3097 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3098 		    st->bucket > hinfo->lhash2_mask) {
3099 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3100 			st->bucket = 0;
3101 		}
3102 	}
3103 
3104 again:
3105 	/* Get a new batch */
3106 	iter->cur_sk = 0;
3107 	iter->end_sk = 0;
3108 	iter->st_bucket_done = false;
3109 
3110 	sk = tcp_seek_last_pos(seq);
3111 	if (!sk)
3112 		return NULL; /* Done */
3113 
3114 	if (st->state == TCP_SEQ_STATE_LISTENING)
3115 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3116 	else
3117 		expected = bpf_iter_tcp_established_batch(seq, sk);
3118 
3119 	if (iter->end_sk == expected) {
3120 		iter->st_bucket_done = true;
3121 		return sk;
3122 	}
3123 
3124 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3125 		resized = true;
3126 		goto again;
3127 	}
3128 
3129 	return sk;
3130 }
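/* Note on the batching above: a whole bucket is walked under its lock and
 * copied into iter->batch.  If the preallocated batch turns out to be too
 * small (end_sk != expected), it is grown once to roughly 1.5x the observed
 * bucket size and the bucket is re-walked; a second shortfall is tolerated
 * and simply yields a partial batch.
 */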
3131 
3132 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3133 {
3134 	/* bpf iter does not support lseek, so it always
3135 	 * continues from where it was stop()-ped.
3136 	 */
3137 	if (*pos)
3138 		return bpf_iter_tcp_batch(seq);
3139 
3140 	return SEQ_START_TOKEN;
3141 }
3142 
3143 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3144 {
3145 	struct bpf_tcp_iter_state *iter = seq->private;
3146 	struct tcp_iter_state *st = &iter->state;
3147 	struct sock *sk;
3148 
3149 	/* Whenever seq_next() is called, the iter->cur_sk is
3150 	 * done with seq_show(), so advance to the next sk in
3151 	 * the batch.
3152 	 */
3153 	if (iter->cur_sk < iter->end_sk) {
3154 		/* Keeping st->num consistent in tcp_iter_state.
3155 		 * bpf_iter_tcp does not use st->num.
3156 		 * meta.seq_num is used instead.
3157 		 */
3158 		st->num++;
3159 		/* Move st->offset to the next sk in the bucket such that
3160 		 * the future start() will resume at st->offset in
3161 		 * st->bucket.  See tcp_seek_last_pos().
3162 		 */
3163 		st->offset++;
3164 		sock_gen_put(iter->batch[iter->cur_sk++]);
3165 	}
3166 
3167 	if (iter->cur_sk < iter->end_sk)
3168 		sk = iter->batch[iter->cur_sk];
3169 	else
3170 		sk = bpf_iter_tcp_batch(seq);
3171 
3172 	++*pos;
3173 	/* Keeping st->last_pos consistent in tcp_iter_state.
3174 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3175 	 */
3176 	st->last_pos = *pos;
3177 	return sk;
3178 }
3179 
3180 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3181 {
3182 	struct bpf_iter_meta meta;
3183 	struct bpf_prog *prog;
3184 	struct sock *sk = v;
3185 	uid_t uid;
3186 	int ret;
3187 
3188 	if (v == SEQ_START_TOKEN)
3189 		return 0;
3190 
3191 	if (sk_fullsock(sk))
3192 		lock_sock(sk);
3193 
3194 	if (unlikely(sk_unhashed(sk))) {
3195 		ret = SEQ_SKIP;
3196 		goto unlock;
3197 	}
3198 
3199 	if (sk->sk_state == TCP_TIME_WAIT) {
3200 		uid = 0;
3201 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3202 		const struct request_sock *req = v;
3203 
3204 		uid = from_kuid_munged(seq_user_ns(seq),
3205 				       sock_i_uid(req->rsk_listener));
3206 	} else {
3207 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3208 	}
3209 
3210 	meta.seq = seq;
3211 	prog = bpf_iter_get_info(&meta, false);
3212 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3213 
3214 unlock:
3215 	if (sk_fullsock(sk))
3216 		release_sock(sk);
3217 	return ret;
3218 
3219 }
3220 
3221 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3222 {
3223 	struct bpf_tcp_iter_state *iter = seq->private;
3224 	struct bpf_iter_meta meta;
3225 	struct bpf_prog *prog;
3226 
3227 	if (!v) {
3228 		meta.seq = seq;
3229 		prog = bpf_iter_get_info(&meta, true);
3230 		if (prog)
3231 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3232 	}
3233 
3234 	if (iter->cur_sk < iter->end_sk) {
3235 		bpf_iter_tcp_put_batch(iter);
3236 		iter->st_bucket_done = false;
3237 	}
3238 }
3239 
3240 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3241 	.show		= bpf_iter_tcp_seq_show,
3242 	.start		= bpf_iter_tcp_seq_start,
3243 	.next		= bpf_iter_tcp_seq_next,
3244 	.stop		= bpf_iter_tcp_seq_stop,
3245 };
3246 #endif
3247 static unsigned short seq_file_family(const struct seq_file *seq)
3248 {
3249 	const struct tcp_seq_afinfo *afinfo;
3250 
3251 #ifdef CONFIG_BPF_SYSCALL
3252 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3253 	if (seq->op == &bpf_iter_tcp_seq_ops)
3254 		return AF_UNSPEC;
3255 #endif
3256 
3257 	/* Iterated from proc fs */
3258 	afinfo = pde_data(file_inode(seq->file));
3259 	return afinfo->family;
3260 }
3261 
3262 static const struct seq_operations tcp4_seq_ops = {
3263 	.show		= tcp4_seq_show,
3264 	.start		= tcp_seq_start,
3265 	.next		= tcp_seq_next,
3266 	.stop		= tcp_seq_stop,
3267 };
3268 
3269 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3270 	.family		= AF_INET,
3271 };
3272 
3273 static int __net_init tcp4_proc_init_net(struct net *net)
3274 {
3275 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3276 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3277 		return -ENOMEM;
3278 	return 0;
3279 }
3280 
3281 static void __net_exit tcp4_proc_exit_net(struct net *net)
3282 {
3283 	remove_proc_entry("tcp", net->proc_net);
3284 }
3285 
3286 static struct pernet_operations tcp4_net_ops = {
3287 	.init = tcp4_proc_init_net,
3288 	.exit = tcp4_proc_exit_net,
3289 };
3290 
3291 int __init tcp4_proc_init(void)
3292 {
3293 	return register_pernet_subsys(&tcp4_net_ops);
3294 }
3295 
3296 void tcp4_proc_exit(void)
3297 {
3298 	unregister_pernet_subsys(&tcp4_net_ops);
3299 }
3300 #endif /* CONFIG_PROC_FS */
3301 
3302 /* @wake is one when sk_stream_write_space() calls us.
3303  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3304  * This mimics the strategy used in sock_def_write_space().
3305  */
3306 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3307 {
3308 	const struct tcp_sock *tp = tcp_sk(sk);
3309 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3310 			    READ_ONCE(tp->snd_nxt);
3311 
3312 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3313 }
3314 EXPORT_SYMBOL(tcp_stream_memory_free);
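/* Worked example: with tcp_notsent_lowat == 128 KB, a plain poll (wake == 0)
 * reports the stream writable while notsent_bytes < 128 KB, whereas the
 * wake-up path (wake == 1) shifts left by one and therefore requires
 * notsent_bytes to drop below 64 KB, i.e. half the limit, before EPOLLOUT
 * is signalled.
 */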
3315 
3316 struct proto tcp_prot = {
3317 	.name			= "TCP",
3318 	.owner			= THIS_MODULE,
3319 	.close			= tcp_close,
3320 	.pre_connect		= tcp_v4_pre_connect,
3321 	.connect		= tcp_v4_connect,
3322 	.disconnect		= tcp_disconnect,
3323 	.accept			= inet_csk_accept,
3324 	.ioctl			= tcp_ioctl,
3325 	.init			= tcp_v4_init_sock,
3326 	.destroy		= tcp_v4_destroy_sock,
3327 	.shutdown		= tcp_shutdown,
3328 	.setsockopt		= tcp_setsockopt,
3329 	.getsockopt		= tcp_getsockopt,
3330 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3331 	.keepalive		= tcp_set_keepalive,
3332 	.recvmsg		= tcp_recvmsg,
3333 	.sendmsg		= tcp_sendmsg,
3334 	.splice_eof		= tcp_splice_eof,
3335 	.backlog_rcv		= tcp_v4_do_rcv,
3336 	.release_cb		= tcp_release_cb,
3337 	.hash			= inet_hash,
3338 	.unhash			= inet_unhash,
3339 	.get_port		= inet_csk_get_port,
3340 	.put_port		= inet_put_port,
3341 #ifdef CONFIG_BPF_SYSCALL
3342 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3343 #endif
3344 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3345 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3346 	.stream_memory_free	= tcp_stream_memory_free,
3347 	.sockets_allocated	= &tcp_sockets_allocated,
3348 	.orphan_count		= &tcp_orphan_count,
3349 
3350 	.memory_allocated	= &tcp_memory_allocated,
3351 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3352 
3353 	.memory_pressure	= &tcp_memory_pressure,
3354 	.sysctl_mem		= sysctl_tcp_mem,
3355 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3356 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3357 	.max_header		= MAX_TCP_HEADER,
3358 	.obj_size		= sizeof(struct tcp_sock),
3359 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3360 	.twsk_prot		= &tcp_timewait_sock_ops,
3361 	.rsk_prot		= &tcp_request_sock_ops,
3362 	.h.hashinfo		= NULL,
3363 	.no_autobind		= true,
3364 	.diag_destroy		= tcp_abort,
3365 };
3366 EXPORT_SYMBOL(tcp_prot);
3367 
3368 static void __net_exit tcp_sk_exit(struct net *net)
3369 {
3370 	if (net->ipv4.tcp_congestion_control)
3371 		bpf_module_put(net->ipv4.tcp_congestion_control,
3372 			       net->ipv4.tcp_congestion_control->owner);
3373 }
3374 
3375 static void __net_init tcp_set_hashinfo(struct net *net)
3376 {
3377 	struct inet_hashinfo *hinfo;
3378 	unsigned int ehash_entries;
3379 	struct net *old_net;
3380 
3381 	if (net_eq(net, &init_net))
3382 		goto fallback;
3383 
3384 	old_net = current->nsproxy->net_ns;
3385 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3386 	if (!ehash_entries)
3387 		goto fallback;
3388 
3389 	ehash_entries = roundup_pow_of_two(ehash_entries);
3390 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3391 	if (!hinfo) {
3392 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3393 			"for a netns, fallback to the global one\n",
3394 			ehash_entries);
3395 fallback:
3396 		hinfo = &tcp_hashinfo;
3397 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3398 	}
3399 
3400 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3401 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3402 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3403 }
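/* Worked example (illustrative values): a child netns created while
 * tcp_child_ehash_entries is 50000 gets a 65536-entry ehash (rounded up to
 * a power of two), sysctl_max_tw_buckets = 32768 and
 * sysctl_max_syn_backlog = max(128, 65536 / 128) = 512.
 */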
3404 
3405 static int __net_init tcp_sk_init(struct net *net)
3406 {
3407 	net->ipv4.sysctl_tcp_ecn = 2;
3408 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3409 
3410 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3411 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3412 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3413 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3414 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3415 
3416 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3417 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3418 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3419 
3420 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3421 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3422 	net->ipv4.sysctl_tcp_syncookies = 1;
3423 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3424 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3425 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3426 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3427 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3428 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3429 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3430 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3431 
3432 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3433 	tcp_set_hashinfo(net);
3434 
3435 	net->ipv4.sysctl_tcp_sack = 1;
3436 	net->ipv4.sysctl_tcp_window_scaling = 1;
3437 	net->ipv4.sysctl_tcp_timestamps = 1;
3438 	net->ipv4.sysctl_tcp_early_retrans = 3;
3439 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3440 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3441 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3442 	net->ipv4.sysctl_tcp_max_reordering = 300;
3443 	net->ipv4.sysctl_tcp_dsack = 1;
3444 	net->ipv4.sysctl_tcp_app_win = 31;
3445 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3446 	net->ipv4.sysctl_tcp_frto = 2;
3447 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3448 	/* This limits the percentage of the congestion window which we
3449 	 * will allow a single TSO frame to consume.  Building TSO frames
3450 	 * which are too large can cause TCP streams to be bursty.
3451 	 */
3452 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3453 	/* Default TSQ limit of 16 TSO segments */
3454 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3455 
3456 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3457 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3458 
3459 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3460 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3461 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3462 	net->ipv4.sysctl_tcp_autocorking = 1;
3463 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3464 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3465 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3466 	if (net != &init_net) {
3467 		memcpy(net->ipv4.sysctl_tcp_rmem,
3468 		       init_net.ipv4.sysctl_tcp_rmem,
3469 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3470 		memcpy(net->ipv4.sysctl_tcp_wmem,
3471 		       init_net.ipv4.sysctl_tcp_wmem,
3472 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3473 	}
3474 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3475 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3476 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3477 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3478 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3479 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3480 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3481 
3482 	/* Set default values for PLB */
3483 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3484 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3485 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3486 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3487 	/* Default congestion threshold for PLB to mark a round is 50% */
3488 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3489 
3490 	/* Reno is always built in */
3491 	if (!net_eq(net, &init_net) &&
3492 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3493 			       init_net.ipv4.tcp_congestion_control->owner))
3494 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3495 	else
3496 		net->ipv4.tcp_congestion_control = &tcp_reno;
3497 
3498 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3499 	net->ipv4.sysctl_tcp_shrink_window = 0;
3500 
3501 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3502 
3503 	return 0;
3504 }
3505 
3506 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3507 {
3508 	struct net *net;
3509 
3510 	tcp_twsk_purge(net_exit_list, AF_INET);
3511 
3512 	list_for_each_entry(net, net_exit_list, exit_list) {
3513 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3514 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3515 		tcp_fastopen_ctx_destroy(net);
3516 	}
3517 }
3518 
3519 static struct pernet_operations __net_initdata tcp_sk_ops = {
3520        .init	   = tcp_sk_init,
3521        .exit	   = tcp_sk_exit,
3522        .exit_batch = tcp_sk_exit_batch,
3523 };
3524 
3525 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3526 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3527 		     struct sock_common *sk_common, uid_t uid)
3528 
3529 #define INIT_BATCH_SZ 16
3530 
3531 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3532 {
3533 	struct bpf_tcp_iter_state *iter = priv_data;
3534 	int err;
3535 
3536 	err = bpf_iter_init_seq_net(priv_data, aux);
3537 	if (err)
3538 		return err;
3539 
3540 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3541 	if (err) {
3542 		bpf_iter_fini_seq_net(priv_data);
3543 		return err;
3544 	}
3545 
3546 	return 0;
3547 }
3548 
3549 static void bpf_iter_fini_tcp(void *priv_data)
3550 {
3551 	struct bpf_tcp_iter_state *iter = priv_data;
3552 
3553 	bpf_iter_fini_seq_net(priv_data);
3554 	kvfree(iter->batch);
3555 }
3556 
3557 static const struct bpf_iter_seq_info tcp_seq_info = {
3558 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3559 	.init_seq_private	= bpf_iter_init_tcp,
3560 	.fini_seq_private	= bpf_iter_fini_tcp,
3561 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3562 };
3563 
3564 static const struct bpf_func_proto *
3565 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3566 			    const struct bpf_prog *prog)
3567 {
3568 	switch (func_id) {
3569 	case BPF_FUNC_setsockopt:
3570 		return &bpf_sk_setsockopt_proto;
3571 	case BPF_FUNC_getsockopt:
3572 		return &bpf_sk_getsockopt_proto;
3573 	default:
3574 		return NULL;
3575 	}
3576 }
3577 
3578 static struct bpf_iter_reg tcp_reg_info = {
3579 	.target			= "tcp",
3580 	.ctx_arg_info_size	= 1,
3581 	.ctx_arg_info		= {
3582 		{ offsetof(struct bpf_iter__tcp, sk_common),
3583 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3584 	},
3585 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3586 	.seq_info		= &tcp_seq_info,
3587 };
3588 
3589 static void __init bpf_iter_register(void)
3590 {
3591 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3592 	if (bpf_iter_reg_target(&tcp_reg_info))
3593 		pr_warn("Warning: could not register bpf iterator tcp\n");
3594 }
3595 
3596 #endif
3597 
3598 void __init tcp_v4_init(void)
3599 {
3600 	int cpu, res;
3601 
3602 	for_each_possible_cpu(cpu) {
3603 		struct sock *sk;
3604 
3605 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3606 					   IPPROTO_TCP, &init_net);
3607 		if (res)
3608 			panic("Failed to create the TCP control socket.\n");
3609 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3610 
3611 		/* Please enforce IP_DF and IPID==0 for RST and
3612 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3613 		 */
3614 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3615 
3616 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3617 	}
3618 	if (register_pernet_subsys(&tcp_sk_ops))
3619 		panic("Failed to create the TCP control socket.\n");
3620 
3621 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3622 	bpf_iter_register();
3623 #endif
3624 }
3625