xref: /linux/net/ipv4/tcp_ipv4.c (revision 4cde72fead4cebb5b6b2fe9425904c2064739184)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 
117 	if (reuse == 2) {
118 		/* Still does not detect *everything* that goes through
119 		 * lo, since we require a loopback src or dst address
120 		 * or direct binding to 'lo' interface.
121 		 */
122 		bool loopback = false;
123 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 			loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126 		if (tw->tw_family == AF_INET6) {
127 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 				loopback = true;
132 		} else
133 #endif
134 		{
135 			if (ipv4_is_loopback(tw->tw_daddr) ||
136 			    ipv4_is_loopback(tw->tw_rcv_saddr))
137 				loopback = true;
138 		}
139 		if (!loopback)
140 			reuse = 0;
141 	}
142 
143 	/* With PAWS, it is safe from the viewpoint
144 	   of data integrity. Even without PAWS it is safe provided sequence
145 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146 
147 	   Actually, the idea is close to VJ's one, only the timestamp cache is
148 	   held not per host but per port pair, and the TW bucket is used as the
149 	   state holder.
150 
151 	   If the TW bucket has already been destroyed, we fall back to VJ's
152 	   scheme and use the initial timestamp retrieved from the peer table.
153 	 */
154 	if (tcptw->tw_ts_recent_stamp &&
155 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156 					    tcptw->tw_ts_recent_stamp)))) {
157 		/* In case of repair and re-using TIME-WAIT sockets we still
158 		 * want to be sure that it is safe as above but honor the
159 		 * sequence numbers and time stamps set as part of the repair
160 		 * process.
161 		 *
162 		 * Without this check re-using a TIME-WAIT socket with TCP
163 		 * repair would accumulate a -1 on the repair assigned
164 		 * sequence number. The first time it is reused the sequence
165 		 * is -1, the second time -2, etc. This fixes that issue
166 		 * without appearing to create any others.
167 		 */
168 		if (likely(!tp->repair)) {
169 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 
171 			if (!seq)
172 				seq = 1;
173 			WRITE_ONCE(tp->write_seq, seq);
174 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
175 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 		}
177 		sock_hold(sktw);
178 		return 1;
179 	}
180 
181 	return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
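
/* For reference, the 'reuse' value tested above comes from the
 * net.ipv4.tcp_tw_reuse sysctl (see Documentation/networking/ip-sysctl.rst):
 *
 *	0 - never reuse TIME-WAIT sockets for new outgoing connections
 *	1 - reuse when it is safe per the PAWS reasoning above
 *	2 - reuse only for loopback traffic (the 'reuse == 2' branch above)
 *
 * A usage sketch, e.g. from a shell:  sysctl -w net.ipv4.tcp_tw_reuse=2
 */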
184 
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 			      int addr_len)
187 {
188 	/* This check is replicated from tcp_v4_connect() and intended to
189 	 * prevent the BPF program called below from accessing bytes that are
190 	 * out of the bounds specified by the user in addr_len.
191 	 */
192 	if (addr_len < sizeof(struct sockaddr_in))
193 		return -EINVAL;
194 
195 	sock_owned_by_me(sk);
196 
197 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
198 }
199 
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 	struct inet_timewait_death_row *tcp_death_row;
205 	struct inet_sock *inet = inet_sk(sk);
206 	struct tcp_sock *tp = tcp_sk(sk);
207 	struct ip_options_rcu *inet_opt;
208 	struct net *net = sock_net(sk);
209 	__be16 orig_sport, orig_dport;
210 	__be32 daddr, nexthop;
211 	struct flowi4 *fl4;
212 	struct rtable *rt;
213 	int err;
214 
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	if (usin->sin_family != AF_INET)
219 		return -EAFNOSUPPORT;
220 
221 	nexthop = daddr = usin->sin_addr.s_addr;
222 	inet_opt = rcu_dereference_protected(inet->inet_opt,
223 					     lockdep_sock_is_held(sk));
224 	if (inet_opt && inet_opt->opt.srr) {
225 		if (!daddr)
226 			return -EINVAL;
227 		nexthop = inet_opt->opt.faddr;
228 	}
229 
230 	orig_sport = inet->inet_sport;
231 	orig_dport = usin->sin_port;
232 	fl4 = &inet->cork.fl.u.ip4;
233 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235 			      orig_dport, sk);
236 	if (IS_ERR(rt)) {
237 		err = PTR_ERR(rt);
238 		if (err == -ENETUNREACH)
239 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240 		return err;
241 	}
242 
243 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 		ip_rt_put(rt);
245 		return -ENETUNREACH;
246 	}
247 
248 	if (!inet_opt || !inet_opt->opt.srr)
249 		daddr = fl4->daddr;
250 
251 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252 
253 	if (!inet->inet_saddr) {
254 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255 		if (err) {
256 			ip_rt_put(rt);
257 			return err;
258 		}
259 	} else {
260 		sk_rcv_saddr_set(sk, inet->inet_saddr);
261 	}
262 
263 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264 		/* Reset inherited state */
265 		tp->rx_opt.ts_recent	   = 0;
266 		tp->rx_opt.ts_recent_stamp = 0;
267 		if (likely(!tp->repair))
268 			WRITE_ONCE(tp->write_seq, 0);
269 	}
270 
271 	inet->inet_dport = usin->sin_port;
272 	sk_daddr_set(sk, daddr);
273 
274 	inet_csk(sk)->icsk_ext_hdr_len = 0;
275 	if (inet_opt)
276 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277 
278 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279 
280 	/* Socket identity is still unknown (sport may be zero).
281 	 * However we set state to SYN-SENT and, without releasing the socket
282 	 * lock, select a source port, enter ourselves into the hash tables and
283 	 * complete initialization after this.
284 	 */
285 	tcp_set_state(sk, TCP_SYN_SENT);
286 	err = inet_hash_connect(tcp_death_row, sk);
287 	if (err)
288 		goto failure;
289 
290 	sk_set_txhash(sk);
291 
292 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293 			       inet->inet_sport, inet->inet_dport, sk);
294 	if (IS_ERR(rt)) {
295 		err = PTR_ERR(rt);
296 		rt = NULL;
297 		goto failure;
298 	}
299 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
300 	/* OK, now commit destination to socket.  */
301 	sk->sk_gso_type = SKB_GSO_TCPV4;
302 	sk_setup_caps(sk, &rt->dst);
303 	rt = NULL;
304 
305 	if (likely(!tp->repair)) {
306 		if (!tp->write_seq)
307 			WRITE_ONCE(tp->write_seq,
308 				   secure_tcp_seq(inet->inet_saddr,
309 						  inet->inet_daddr,
310 						  inet->inet_sport,
311 						  usin->sin_port));
312 		WRITE_ONCE(tp->tsoffset,
313 			   secure_tcp_ts_off(net, inet->inet_saddr,
314 					     inet->inet_daddr));
315 	}
316 
317 	atomic_set(&inet->inet_id, get_random_u16());
318 
319 	if (tcp_fastopen_defer_connect(sk, &err))
320 		return err;
321 	if (err)
322 		goto failure;
323 
324 	err = tcp_connect(sk);
325 
326 	if (err)
327 		goto failure;
328 
329 	return 0;
330 
331 failure:
332 	/*
333 	 * This unhashes the socket and releases the local port,
334 	 * if necessary.
335 	 */
336 	tcp_set_state(sk, TCP_CLOSE);
337 	inet_bhash2_reset_saddr(sk);
338 	ip_rt_put(rt);
339 	sk->sk_route_caps = 0;
340 	inet->inet_dport = 0;
341 	return err;
342 }
343 EXPORT_SYMBOL(tcp_v4_connect);
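
/* A minimal userspace sketch of the call that ends up here via the
 * AF_INET stream protocol ops; the address and port are placeholders,
 * error handling is omitted and the block is illustrative only:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port   = htons(80) };
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */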
344 
345 /*
346  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
347  * It can be called through tcp_release_cb() if the socket was owned by the user
348  * at the time tcp_v4_err() was called to handle the ICMP message.
349  */
350 void tcp_v4_mtu_reduced(struct sock *sk)
351 {
352 	struct inet_sock *inet = inet_sk(sk);
353 	struct dst_entry *dst;
354 	u32 mtu;
355 
356 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
357 		return;
358 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
359 	dst = inet_csk_update_pmtu(sk, mtu);
360 	if (!dst)
361 		return;
362 
363 	/* Something is about to go wrong... Remember the soft error
364 	 * in case this connection is not able to recover.
365 	 */
366 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
367 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
368 
369 	mtu = dst_mtu(dst);
370 
371 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
372 	    ip_sk_accept_pmtu(sk) &&
373 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
374 		tcp_sync_mss(sk, mtu);
375 
376 		/* Resend the TCP packet because it's
377 		 * clear that the old packet has been
378 		 * dropped. This is the new "fast" path mtu
379 		 * discovery.
380 		 */
381 		tcp_simple_retransmit(sk);
382 	} /* else let the usual retransmit timer handle it */
383 }
384 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
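
/* The inet->pmtudisc check above honours the per-socket IP_MTU_DISCOVER
 * setting. A hedged userspace sketch of opting out of kernel PMTU
 * handling for a socket (illustrative only):
 *
 *	int val = IP_PMTUDISC_DONT;
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */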
385 
386 static void do_redirect(struct sk_buff *skb, struct sock *sk)
387 {
388 	struct dst_entry *dst = __sk_dst_check(sk, 0);
389 
390 	if (dst)
391 		dst->ops->redirect(dst, sk, skb);
392 }
393 
394 
395 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
396 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
397 {
398 	struct request_sock *req = inet_reqsk(sk);
399 	struct net *net = sock_net(sk);
400 
401 	/* ICMPs are not backlogged, hence we cannot get
402 	 * an established socket here.
403 	 */
404 	if (seq != tcp_rsk(req)->snt_isn) {
405 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
406 	} else if (abort) {
407 		/*
408 		 * Still in SYN_RECV, just remove it silently.
409 		 * There is no good way to pass the error to the newly
410 		 * created socket, and POSIX does not want network
411 		 * errors returned from accept().
412 		 */
413 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
414 		tcp_listendrop(req->rsk_listener);
415 	}
416 	reqsk_put(req);
417 }
418 EXPORT_SYMBOL(tcp_req_err);
419 
420 /* TCP-LD (RFC 6069) logic */
421 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
422 {
423 	struct inet_connection_sock *icsk = inet_csk(sk);
424 	struct tcp_sock *tp = tcp_sk(sk);
425 	struct sk_buff *skb;
426 	s32 remaining;
427 	u32 delta_us;
428 
429 	if (sock_owned_by_user(sk))
430 		return;
431 
432 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
433 	    !icsk->icsk_backoff)
434 		return;
435 
436 	skb = tcp_rtx_queue_head(sk);
437 	if (WARN_ON_ONCE(!skb))
438 		return;
439 
440 	icsk->icsk_backoff--;
441 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
442 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
443 
444 	tcp_mstamp_refresh(tp);
445 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
446 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
447 
448 	if (remaining > 0) {
449 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
450 					  remaining, TCP_RTO_MAX);
451 	} else {
452 		/* RTO revert clocked out retransmission.
453 		 * Will retransmit now.
454 		 */
455 		tcp_retransmit_timer(sk);
456 	}
457 }
458 EXPORT_SYMBOL(tcp_ld_RTO_revert);
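
/* A worked example of the revert above, assuming inet_csk_rto_backoff()
 * scales the base RTO by 2^icsk_backoff (numbers are illustrative):
 *
 *	base RTO from srtt_us        : 200 ms
 *	icsk_backoff before revert   : 3  (backed-off RTO was 1600 ms)
 *	icsk_backoff after decrement : 2  -> new RTO = 200 ms << 2 = 800 ms
 *	time elapsed since the head
 *	of the rtx queue was sent    : 300 ms
 *	remaining = 800 ms - 300 ms  : 500 ms -> re-arm the retransmit timer
 *
 * Had more than 800 ms already elapsed, remaining would be <= 0 and
 * tcp_retransmit_timer() would run immediately instead.
 */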
459 
460 /*
461  * This routine is called by the ICMP module when it gets some
462  * sort of error condition.  If err < 0 then the socket should
463  * be closed and the error returned to the user.  If err > 0
464  * it's just the icmp type << 8 | icmp code.  After adjustment, the
465  * header points to the first 8 bytes of the tcp header.  We need
466  * to find the appropriate port.
467  *
468  * The locking strategy used here is very "optimistic". When
469  * someone else accesses the socket the ICMP is just dropped
470  * and for some paths there is no check at all.
471  * A more general error queue to queue errors for later handling
472  * is probably better.
473  *
474  */
475 
476 int tcp_v4_err(struct sk_buff *skb, u32 info)
477 {
478 	const struct iphdr *iph = (const struct iphdr *)skb->data;
479 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
480 	struct tcp_sock *tp;
481 	const int type = icmp_hdr(skb)->type;
482 	const int code = icmp_hdr(skb)->code;
483 	struct sock *sk;
484 	struct request_sock *fastopen;
485 	bool harderr = false;
486 	u32 seq, snd_una;
487 	int err;
488 	struct net *net = dev_net(skb->dev);
489 
490 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
491 				       iph->daddr, th->dest, iph->saddr,
492 				       ntohs(th->source), inet_iif(skb), 0);
493 	if (!sk) {
494 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
495 		return -ENOENT;
496 	}
497 	if (sk->sk_state == TCP_TIME_WAIT) {
498 		/* To increase the counter of ignored icmps for TCP-AO */
499 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
500 		inet_twsk_put(inet_twsk(sk));
501 		return 0;
502 	}
503 	seq = ntohl(th->seq);
504 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
505 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
506 				     type == ICMP_TIME_EXCEEDED ||
507 				     (type == ICMP_DEST_UNREACH &&
508 				      (code == ICMP_NET_UNREACH ||
509 				       code == ICMP_HOST_UNREACH)));
510 		return 0;
511 	}
512 
513 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
514 		sock_put(sk);
515 		return 0;
516 	}
517 
518 	bh_lock_sock(sk);
519 	/* If too many ICMPs get dropped on busy
520 	 * servers this needs to be solved differently.
521 	 * We do take care of the PMTU discovery (RFC 1191) special case:
522 	 * we can receive locally generated ICMP messages while the socket is held.
523 	 */
524 	if (sock_owned_by_user(sk)) {
525 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
526 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
527 	}
528 	if (sk->sk_state == TCP_CLOSE)
529 		goto out;
530 
531 	if (static_branch_unlikely(&ip4_min_ttl)) {
532 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
533 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
534 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
535 			goto out;
536 		}
537 	}
538 
539 	tp = tcp_sk(sk);
540 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
541 	fastopen = rcu_dereference(tp->fastopen_rsk);
542 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
543 	if (sk->sk_state != TCP_LISTEN &&
544 	    !between(seq, snd_una, tp->snd_nxt)) {
545 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
546 		goto out;
547 	}
548 
549 	switch (type) {
550 	case ICMP_REDIRECT:
551 		if (!sock_owned_by_user(sk))
552 			do_redirect(skb, sk);
553 		goto out;
554 	case ICMP_SOURCE_QUENCH:
555 		/* Just silently ignore these. */
556 		goto out;
557 	case ICMP_PARAMETERPROB:
558 		err = EPROTO;
559 		harderr = true;
560 		break;
561 	case ICMP_DEST_UNREACH:
562 		if (code > NR_ICMP_UNREACH)
563 			goto out;
564 
565 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
566 			/* We are not interested in TCP_LISTEN and open_requests
567 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
568 			 * they should go through unfragmented).
569 			 */
570 			if (sk->sk_state == TCP_LISTEN)
571 				goto out;
572 
573 			WRITE_ONCE(tp->mtu_info, info);
574 			if (!sock_owned_by_user(sk)) {
575 				tcp_v4_mtu_reduced(sk);
576 			} else {
577 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
578 					sock_hold(sk);
579 			}
580 			goto out;
581 		}
582 
583 		err = icmp_err_convert[code].errno;
584 		harderr = icmp_err_convert[code].fatal;
585 		/* check if this ICMP message allows revert of backoff.
586 		 * (see RFC 6069)
587 		 */
588 		if (!fastopen &&
589 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
590 			tcp_ld_RTO_revert(sk, seq);
591 		break;
592 	case ICMP_TIME_EXCEEDED:
593 		err = EHOSTUNREACH;
594 		break;
595 	default:
596 		goto out;
597 	}
598 
599 	switch (sk->sk_state) {
600 	case TCP_SYN_SENT:
601 	case TCP_SYN_RECV:
602 		/* Only in fast or simultaneous open. If a fast open socket is
603 		 * already accepted it is treated as a connected one below.
604 		 */
605 		if (fastopen && !fastopen->sk)
606 			break;
607 
608 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
609 
610 		if (!harderr)
611 			break;
612 
613 		if (!sock_owned_by_user(sk)) {
614 			WRITE_ONCE(sk->sk_err, err);
615 
616 			sk_error_report(sk);
617 
618 			tcp_done(sk);
619 		} else {
620 			WRITE_ONCE(sk->sk_err_soft, err);
621 		}
622 		goto out;
623 	}
624 
625 	/* If we've already connected we will keep trying
626 	 * until we time out, or the user gives up.
627 	 *
628 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
629 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
630 	 * but it is obsoleted by PMTU discovery).
631 	 *
632 	 * Note that on the modern internet, where routing is unreliable
633 	 * and broken firewalls sit in every dark corner sending random
634 	 * errors ordered by their masters, even these two messages have lost
635 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
636 	 *
637 	 * Now we are in compliance with the RFCs.
638 	 *							--ANK (980905)
639 	 */
640 
641 	if (!sock_owned_by_user(sk) &&
642 	    inet_test_bit(RECVERR, sk)) {
643 		WRITE_ONCE(sk->sk_err, err);
644 		sk_error_report(sk);
645 	} else	{ /* Only an error on timeout */
646 		WRITE_ONCE(sk->sk_err_soft, err);
647 	}
648 
649 out:
650 	bh_unlock_sock(sk);
651 	sock_put(sk);
652 	return 0;
653 }
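
/* How the errors recorded above surface to userspace, as a hedged
 * sketch (illustrative only; msghdr setup is omitted): a pending sk_err
 * is returned by getsockopt(SO_ERROR) or by the next socket call, and
 * errors queued via ip_icmp_error() can be read from the error queue
 * once IP_RECVERR is enabled (the cmsg of level IPPROTO_IP / type
 * IP_RECVERR carries a struct sock_extended_err):
 *
 *	int on = 1, err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 *	recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
 */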
654 
655 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
656 {
657 	struct tcphdr *th = tcp_hdr(skb);
658 
659 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
660 	skb->csum_start = skb_transport_header(skb) - skb->head;
661 	skb->csum_offset = offsetof(struct tcphdr, check);
662 }
663 
664 /* This routine computes an IPv4 TCP checksum. */
665 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
666 {
667 	const struct inet_sock *inet = inet_sk(sk);
668 
669 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
670 }
671 EXPORT_SYMBOL(tcp_v4_send_check);
672 
673 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
674 
675 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
676 				 const struct tcp_ao_hdr *aoh,
677 				 struct ip_reply_arg *arg, struct tcphdr *reply,
678 				 __be32 reply_options[REPLY_OPTIONS_LEN])
679 {
680 #ifdef CONFIG_TCP_AO
681 	int sdif = tcp_v4_sdif(skb);
682 	int dif = inet_iif(skb);
683 	int l3index = sdif ? dif : 0;
684 	bool allocated_traffic_key;
685 	struct tcp_ao_key *key;
686 	char *traffic_key;
687 	bool drop = true;
688 	u32 ao_sne = 0;
689 	u8 keyid;
690 
691 	rcu_read_lock();
692 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
693 				 &key, &traffic_key, &allocated_traffic_key,
694 				 &keyid, &ao_sne))
695 		goto out;
696 
697 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
698 				 (aoh->rnext_keyid << 8) | keyid);
699 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
700 	reply->doff = arg->iov[0].iov_len / 4;
701 
702 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
703 			    key, traffic_key,
704 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
705 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
706 			    reply, ao_sne))
707 		goto out;
708 	drop = false;
709 out:
710 	rcu_read_unlock();
711 	if (allocated_traffic_key)
712 		kfree(traffic_key);
713 	return drop;
714 #else
715 	return true;
716 #endif
717 }
718 
719 /*
720  *	This routine will send an RST to the other TCP.
721  *
722  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
723  *		      for the reset?
724  *	Answer: if a packet caused an RST, it is not for a socket
725  *		existing in our system; if it is matched to a socket,
726  *		it is just a duplicate segment or a bug in the other side's TCP.
727  *		So we build the reply based only on the parameters
728  *		that arrived with the segment.
729  *	Exception: precedence violation. We do not implement it in any case.
730  */
731 
732 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
733 {
734 	const struct tcphdr *th = tcp_hdr(skb);
735 	struct {
736 		struct tcphdr th;
737 		__be32 opt[REPLY_OPTIONS_LEN];
738 	} rep;
739 	const __u8 *md5_hash_location = NULL;
740 	const struct tcp_ao_hdr *aoh;
741 	struct ip_reply_arg arg;
742 #ifdef CONFIG_TCP_MD5SIG
743 	struct tcp_md5sig_key *key = NULL;
744 	unsigned char newhash[16];
745 	struct sock *sk1 = NULL;
746 	int genhash;
747 #endif
748 	u64 transmit_time = 0;
749 	struct sock *ctl_sk;
750 	struct net *net;
751 	u32 txhash = 0;
752 
753 	/* Never send a reset in response to a reset. */
754 	if (th->rst)
755 		return;
756 
757 	/* If sk is not NULL, it means we did a successful lookup and the
758 	 * incoming route had to be correct. prequeue might have dropped our dst.
759 	 */
760 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
761 		return;
762 
763 	/* Swap the send and the receive. */
764 	memset(&rep, 0, sizeof(rep));
765 	rep.th.dest   = th->source;
766 	rep.th.source = th->dest;
767 	rep.th.doff   = sizeof(struct tcphdr) / 4;
768 	rep.th.rst    = 1;
769 
770 	if (th->ack) {
771 		rep.th.seq = th->ack_seq;
772 	} else {
773 		rep.th.ack = 1;
774 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
775 				       skb->len - (th->doff << 2));
776 	}
777 
778 	memset(&arg, 0, sizeof(arg));
779 	arg.iov[0].iov_base = (unsigned char *)&rep;
780 	arg.iov[0].iov_len  = sizeof(rep.th);
781 
782 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
783 
784 	/* Invalid TCP option size or twice included auth */
785 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
786 		return;
787 
788 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
789 		return;
790 
791 #ifdef CONFIG_TCP_MD5SIG
792 	rcu_read_lock();
793 	if (sk && sk_fullsock(sk)) {
794 		const union tcp_md5_addr *addr;
795 		int l3index;
796 
797 		/* sdif set means the packet ingressed via a device
798 		 * in an L3 domain and inet_iif is set to it.
799 		 */
800 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
801 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
802 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
803 	} else if (md5_hash_location) {
804 		const union tcp_md5_addr *addr;
805 		int sdif = tcp_v4_sdif(skb);
806 		int dif = inet_iif(skb);
807 		int l3index;
808 
809 		/*
810 		 * The active side is lost. Try to find the listening socket via
811 		 * the source port, and then find the MD5 key via that socket.
812 		 * We do not lose security here:
813 		 * the incoming packet is checked against the MD5 hash of the key
814 		 * we find, and no RST is generated if the hash doesn't match.
815 		 */
816 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
817 					     NULL, 0, ip_hdr(skb)->saddr,
818 					     th->source, ip_hdr(skb)->daddr,
819 					     ntohs(th->source), dif, sdif);
820 		/* don't send rst if it can't find key */
821 		if (!sk1)
822 			goto out;
823 
824 		/* sdif set means the packet ingressed via a device
825 		 * in an L3 domain and dif is set to it.
826 		 */
827 		l3index = sdif ? dif : 0;
828 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
829 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
830 		if (!key)
831 			goto out;
832 
833 
834 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
835 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
836 			goto out;
837 
838 	}
839 
840 	if (key) {
841 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
842 				   (TCPOPT_NOP << 16) |
843 				   (TCPOPT_MD5SIG << 8) |
844 				   TCPOLEN_MD5SIG);
845 		/* Update length and the length the header thinks exists */
846 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
847 		rep.th.doff = arg.iov[0].iov_len / 4;
848 
849 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
850 				     key, ip_hdr(skb)->saddr,
851 				     ip_hdr(skb)->daddr, &rep.th);
852 	}
853 #endif
854 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
855 	if (rep.opt[0] == 0) {
856 		__be32 mrst = mptcp_reset_option(skb);
857 
858 		if (mrst) {
859 			rep.opt[0] = mrst;
860 			arg.iov[0].iov_len += sizeof(mrst);
861 			rep.th.doff = arg.iov[0].iov_len / 4;
862 		}
863 	}
864 
865 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
866 				      ip_hdr(skb)->saddr, /* XXX */
867 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
868 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
869 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
870 
871 	/* When the socket is gone, all binding information is lost and
872 	 * routing might fail in this case. No choice here: if we choose to force
873 	 * the input interface, we will misroute in case of an asymmetric route.
874 	 */
875 	if (sk) {
876 		arg.bound_dev_if = sk->sk_bound_dev_if;
877 		if (sk_fullsock(sk))
878 			trace_tcp_send_reset(sk, skb);
879 	}
880 
881 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
882 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
883 
884 	arg.tos = ip_hdr(skb)->tos;
885 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
886 	local_bh_disable();
887 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
888 	sock_net_set(ctl_sk, net);
889 	if (sk) {
890 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
891 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
892 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
893 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
894 		transmit_time = tcp_transmit_time(sk);
895 		xfrm_sk_clone_policy(ctl_sk, sk);
896 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
897 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
898 	} else {
899 		ctl_sk->sk_mark = 0;
900 		ctl_sk->sk_priority = 0;
901 	}
902 	ip_send_unicast_reply(ctl_sk,
903 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
904 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
905 			      &arg, arg.iov[0].iov_len,
906 			      transmit_time, txhash);
907 
908 	xfrm_sk_free_policy(ctl_sk);
909 	sock_net_set(ctl_sk, &init_net);
910 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
911 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
912 	local_bh_enable();
913 
914 #ifdef CONFIG_TCP_MD5SIG
915 out:
916 	rcu_read_unlock();
917 #endif
918 }
919 
920 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
921    outside of socket context, is certainly ugly. What can I do?
922  */
923 
924 static void tcp_v4_send_ack(const struct sock *sk,
925 			    struct sk_buff *skb, u32 seq, u32 ack,
926 			    u32 win, u32 tsval, u32 tsecr, int oif,
927 			    struct tcp_key *key,
928 			    int reply_flags, u8 tos, u32 txhash)
929 {
930 	const struct tcphdr *th = tcp_hdr(skb);
931 	struct {
932 		struct tcphdr th;
933 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
934 	} rep;
935 	struct net *net = sock_net(sk);
936 	struct ip_reply_arg arg;
937 	struct sock *ctl_sk;
938 	u64 transmit_time;
939 
940 	memset(&rep.th, 0, sizeof(struct tcphdr));
941 	memset(&arg, 0, sizeof(arg));
942 
943 	arg.iov[0].iov_base = (unsigned char *)&rep;
944 	arg.iov[0].iov_len  = sizeof(rep.th);
945 	if (tsecr) {
946 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
947 				   (TCPOPT_TIMESTAMP << 8) |
948 				   TCPOLEN_TIMESTAMP);
949 		rep.opt[1] = htonl(tsval);
950 		rep.opt[2] = htonl(tsecr);
951 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
952 	}
953 
954 	/* Swap the send and the receive. */
955 	rep.th.dest    = th->source;
956 	rep.th.source  = th->dest;
957 	rep.th.doff    = arg.iov[0].iov_len / 4;
958 	rep.th.seq     = htonl(seq);
959 	rep.th.ack_seq = htonl(ack);
960 	rep.th.ack     = 1;
961 	rep.th.window  = htons(win);
962 
963 #ifdef CONFIG_TCP_MD5SIG
964 	if (tcp_key_is_md5(key)) {
965 		int offset = (tsecr) ? 3 : 0;
966 
967 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
968 					  (TCPOPT_NOP << 16) |
969 					  (TCPOPT_MD5SIG << 8) |
970 					  TCPOLEN_MD5SIG);
971 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
972 		rep.th.doff = arg.iov[0].iov_len/4;
973 
974 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
975 				    key->md5_key, ip_hdr(skb)->saddr,
976 				    ip_hdr(skb)->daddr, &rep.th);
977 	}
978 #endif
979 #ifdef CONFIG_TCP_AO
980 	if (tcp_key_is_ao(key)) {
981 		int offset = (tsecr) ? 3 : 0;
982 
983 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
984 					  (tcp_ao_len(key->ao_key) << 16) |
985 					  (key->ao_key->sndid << 8) |
986 					  key->rcv_next);
987 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
988 		rep.th.doff = arg.iov[0].iov_len / 4;
989 
990 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
991 				key->ao_key, key->traffic_key,
992 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
993 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
994 				&rep.th, key->sne);
995 	}
996 #endif
997 	arg.flags = reply_flags;
998 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
999 				      ip_hdr(skb)->saddr, /* XXX */
1000 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1001 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1002 	if (oif)
1003 		arg.bound_dev_if = oif;
1004 	arg.tos = tos;
1005 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1006 	local_bh_disable();
1007 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
1008 	sock_net_set(ctl_sk, net);
1009 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1010 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1011 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1012 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1013 	transmit_time = tcp_transmit_time(sk);
1014 	ip_send_unicast_reply(ctl_sk,
1015 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1016 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1017 			      &arg, arg.iov[0].iov_len,
1018 			      transmit_time, txhash);
1019 
1020 	sock_net_set(ctl_sk, &init_net);
1021 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1022 	local_bh_enable();
1023 }
1024 
1025 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1026 {
1027 	struct inet_timewait_sock *tw = inet_twsk(sk);
1028 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1029 	struct tcp_key key = {};
1030 #ifdef CONFIG_TCP_AO
1031 	struct tcp_ao_info *ao_info;
1032 
1033 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1034 		/* FIXME: the segment to-be-acked is not verified yet */
1035 		ao_info = rcu_dereference(tcptw->ao_info);
1036 		if (ao_info) {
1037 			const struct tcp_ao_hdr *aoh;
1038 
1039 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1040 				inet_twsk_put(tw);
1041 				return;
1042 			}
1043 
1044 			if (aoh)
1045 				key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1046 		}
1047 	}
1048 	if (key.ao_key) {
1049 		struct tcp_ao_key *rnext_key;
1050 
1051 		key.traffic_key = snd_other_key(key.ao_key);
1052 		key.sne = READ_ONCE(ao_info->snd_sne);
1053 		rnext_key = READ_ONCE(ao_info->rnext_key);
1054 		key.rcv_next = rnext_key->rcvid;
1055 		key.type = TCP_KEY_AO;
1056 #else
1057 	if (0) {
1058 #endif
1059 #ifdef CONFIG_TCP_MD5SIG
1060 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1061 		key.md5_key = tcp_twsk_md5_key(tcptw);
1062 		if (key.md5_key)
1063 			key.type = TCP_KEY_MD5;
1064 #endif
1065 	}
1066 
1067 	tcp_v4_send_ack(sk, skb,
1068 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
1069 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1070 			tcp_tw_tsval(tcptw),
1071 			tcptw->tw_ts_recent,
1072 			tw->tw_bound_dev_if, &key,
1073 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1074 			tw->tw_tos,
1075 			tw->tw_txhash);
1076 
1077 	inet_twsk_put(tw);
1078 }
1079 
1080 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1081 				  struct request_sock *req)
1082 {
1083 	struct tcp_key key = {};
1084 
1085 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1086 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1087 	 */
1088 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1089 					     tcp_sk(sk)->snd_nxt;
1090 
1091 #ifdef CONFIG_TCP_AO
1092 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1093 	    tcp_rsk_used_ao(req)) {
1094 		const union tcp_md5_addr *addr;
1095 		const struct tcp_ao_hdr *aoh;
1096 		int l3index;
1097 
1098 		/* Invalid TCP option size or twice included auth */
1099 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1100 			return;
1101 		if (!aoh)
1102 			return;
1103 
1104 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1105 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1106 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1107 					      aoh->rnext_keyid, -1);
1108 		if (unlikely(!key.ao_key)) {
1109 			/* Send ACK with any matching MKT for the peer */
1110 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1111 			/* The matching key disappeared (user removed the key?);
1112 			 * let the handshake time out.
1113 			 */
1114 			if (!key.ao_key) {
1115 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1116 						     addr,
1117 						     ntohs(tcp_hdr(skb)->source),
1118 						     &ip_hdr(skb)->daddr,
1119 						     ntohs(tcp_hdr(skb)->dest));
1120 				return;
1121 			}
1122 		}
1123 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1124 		if (!key.traffic_key)
1125 			return;
1126 
1127 		key.type = TCP_KEY_AO;
1128 		key.rcv_next = aoh->keyid;
1129 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1130 #else
1131 	if (0) {
1132 #endif
1133 #ifdef CONFIG_TCP_MD5SIG
1134 	} else if (static_branch_unlikely(&tcp_md5_needed.key)) {
1135 		const union tcp_md5_addr *addr;
1136 		int l3index;
1137 
1138 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1139 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1140 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1141 		if (key.md5_key)
1142 			key.type = TCP_KEY_MD5;
1143 #endif
1144 	}
1145 
1146 	/* RFC 7323 2.3
1147 	 * The window field (SEG.WND) of every outgoing segment, with the
1148 	 * exception of <SYN> segments, MUST be right-shifted by
1149 	 * Rcv.Wind.Shift bits:
1150 	 */
1151 	tcp_v4_send_ack(sk, skb, seq,
1152 			tcp_rsk(req)->rcv_nxt,
1153 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
1154 			tcp_rsk_tsval(tcp_rsk(req)),
1155 			READ_ONCE(req->ts_recent),
1156 			0, &key,
1157 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1158 			ip_hdr(skb)->tos,
1159 			READ_ONCE(tcp_rsk(req)->txhash));
1160 	if (tcp_key_is_ao(&key))
1161 		kfree(key.traffic_key);
1162 }
1163 
1164 /*
1165  *	Send a SYN-ACK after having received a SYN.
1166  *	This still operates on a request_sock only, not on a big
1167  *	socket.
1168  */
1169 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1170 			      struct flowi *fl,
1171 			      struct request_sock *req,
1172 			      struct tcp_fastopen_cookie *foc,
1173 			      enum tcp_synack_type synack_type,
1174 			      struct sk_buff *syn_skb)
1175 {
1176 	const struct inet_request_sock *ireq = inet_rsk(req);
1177 	struct flowi4 fl4;
1178 	int err = -1;
1179 	struct sk_buff *skb;
1180 	u8 tos;
1181 
1182 	/* First, grab a route. */
1183 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1184 		return -1;
1185 
1186 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1187 
1188 	if (skb) {
1189 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1190 
1191 		tos = READ_ONCE(inet_sk(sk)->tos);
1192 
1193 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1194 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1195 			      (tos & INET_ECN_MASK);
1196 
1197 		if (!INET_ECN_is_capable(tos) &&
1198 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1199 			tos |= INET_ECN_ECT_0;
1200 
1201 		rcu_read_lock();
1202 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1203 					    ireq->ir_rmt_addr,
1204 					    rcu_dereference(ireq->ireq_opt),
1205 					    tos);
1206 		rcu_read_unlock();
1207 		err = net_xmit_eval(err);
1208 	}
1209 
1210 	return err;
1211 }
1212 
1213 /*
1214  *	IPv4 request_sock destructor.
1215  */
1216 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1217 {
1218 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1219 }
1220 
1221 #ifdef CONFIG_TCP_MD5SIG
1222 /*
1223  * RFC2385 MD5 checksumming requires a mapping of
1224  * IP address->MD5 Key.
1225  * We need to maintain these in the sk structure.
1226  */
1227 
1228 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1229 EXPORT_SYMBOL(tcp_md5_needed);
1230 
1231 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1232 {
1233 	if (!old)
1234 		return true;
1235 
1236 	/* l3index always overrides non-l3index */
1237 	if (old->l3index && new->l3index == 0)
1238 		return false;
1239 	if (old->l3index == 0 && new->l3index)
1240 		return true;
1241 
1242 	return old->prefixlen < new->prefixlen;
1243 }
1244 
1245 /* Find the Key structure for an address.  */
1246 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1247 					   const union tcp_md5_addr *addr,
1248 					   int family, bool any_l3index)
1249 {
1250 	const struct tcp_sock *tp = tcp_sk(sk);
1251 	struct tcp_md5sig_key *key;
1252 	const struct tcp_md5sig_info *md5sig;
1253 	__be32 mask;
1254 	struct tcp_md5sig_key *best_match = NULL;
1255 	bool match;
1256 
1257 	/* caller either holds rcu_read_lock() or socket lock */
1258 	md5sig = rcu_dereference_check(tp->md5sig_info,
1259 				       lockdep_sock_is_held(sk));
1260 	if (!md5sig)
1261 		return NULL;
1262 
1263 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1264 				 lockdep_sock_is_held(sk)) {
1265 		if (key->family != family)
1266 			continue;
1267 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1268 		    key->l3index != l3index)
1269 			continue;
1270 		if (family == AF_INET) {
1271 			mask = inet_make_mask(key->prefixlen);
1272 			match = (key->addr.a4.s_addr & mask) ==
1273 				(addr->a4.s_addr & mask);
1274 #if IS_ENABLED(CONFIG_IPV6)
1275 		} else if (family == AF_INET6) {
1276 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1277 						  key->prefixlen);
1278 #endif
1279 		} else {
1280 			match = false;
1281 		}
1282 
1283 		if (match && better_md5_match(best_match, key))
1284 			best_match = key;
1285 	}
1286 	return best_match;
1287 }
1288 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1289 
1290 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1291 						      const union tcp_md5_addr *addr,
1292 						      int family, u8 prefixlen,
1293 						      int l3index, u8 flags)
1294 {
1295 	const struct tcp_sock *tp = tcp_sk(sk);
1296 	struct tcp_md5sig_key *key;
1297 	unsigned int size = sizeof(struct in_addr);
1298 	const struct tcp_md5sig_info *md5sig;
1299 
1300 	/* caller either holds rcu_read_lock() or socket lock */
1301 	md5sig = rcu_dereference_check(tp->md5sig_info,
1302 				       lockdep_sock_is_held(sk));
1303 	if (!md5sig)
1304 		return NULL;
1305 #if IS_ENABLED(CONFIG_IPV6)
1306 	if (family == AF_INET6)
1307 		size = sizeof(struct in6_addr);
1308 #endif
1309 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1310 				 lockdep_sock_is_held(sk)) {
1311 		if (key->family != family)
1312 			continue;
1313 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1314 			continue;
1315 		if (key->l3index != l3index)
1316 			continue;
1317 		if (!memcmp(&key->addr, addr, size) &&
1318 		    key->prefixlen == prefixlen)
1319 			return key;
1320 	}
1321 	return NULL;
1322 }
1323 
1324 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1325 					 const struct sock *addr_sk)
1326 {
1327 	const union tcp_md5_addr *addr;
1328 	int l3index;
1329 
1330 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1331 						 addr_sk->sk_bound_dev_if);
1332 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1333 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1334 }
1335 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1336 
1337 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1338 {
1339 	struct tcp_sock *tp = tcp_sk(sk);
1340 	struct tcp_md5sig_info *md5sig;
1341 
1342 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1343 	if (!md5sig)
1344 		return -ENOMEM;
1345 
1346 	sk_gso_disable(sk);
1347 	INIT_HLIST_HEAD(&md5sig->head);
1348 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1349 	return 0;
1350 }
1351 
1352 /* This can be called on a newly created socket, from other files */
1353 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1354 			    int family, u8 prefixlen, int l3index, u8 flags,
1355 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1356 {
1357 	/* Add Key to the list */
1358 	struct tcp_md5sig_key *key;
1359 	struct tcp_sock *tp = tcp_sk(sk);
1360 	struct tcp_md5sig_info *md5sig;
1361 
1362 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1363 	if (key) {
1364 		/* Pre-existing entry - just update that one.
1365 		 * Note that the key might be used concurrently.
1366 		 * data_race() tells KCSAN that we do not care about
1367 		 * key mismatches, since changing an MD5 key on live flows
1368 		 * can lead to packet drops.
1369 		 */
1370 		data_race(memcpy(key->key, newkey, newkeylen));
1371 
1372 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1373 		 * Also note that a reader could catch the new key->keylen value
1374 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1375 		 * at sock_kmalloc() time below these lines.
1376 		 */
1377 		WRITE_ONCE(key->keylen, newkeylen);
1378 
1379 		return 0;
1380 	}
1381 
1382 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1383 					   lockdep_sock_is_held(sk));
1384 
1385 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1386 	if (!key)
1387 		return -ENOMEM;
1388 
1389 	memcpy(key->key, newkey, newkeylen);
1390 	key->keylen = newkeylen;
1391 	key->family = family;
1392 	key->prefixlen = prefixlen;
1393 	key->l3index = l3index;
1394 	key->flags = flags;
1395 	memcpy(&key->addr, addr,
1396 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1397 								 sizeof(struct in_addr));
1398 	hlist_add_head_rcu(&key->node, &md5sig->head);
1399 	return 0;
1400 }
1401 
1402 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1403 		   int family, u8 prefixlen, int l3index, u8 flags,
1404 		   const u8 *newkey, u8 newkeylen)
1405 {
1406 	struct tcp_sock *tp = tcp_sk(sk);
1407 
1408 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1409 		if (tcp_md5_alloc_sigpool())
1410 			return -ENOMEM;
1411 
1412 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1413 			tcp_md5_release_sigpool();
1414 			return -ENOMEM;
1415 		}
1416 
1417 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1418 			struct tcp_md5sig_info *md5sig;
1419 
1420 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1421 			rcu_assign_pointer(tp->md5sig_info, NULL);
1422 			kfree_rcu(md5sig, rcu);
1423 			tcp_md5_release_sigpool();
1424 			return -EUSERS;
1425 		}
1426 	}
1427 
1428 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1429 				newkey, newkeylen, GFP_KERNEL);
1430 }
1431 EXPORT_SYMBOL(tcp_md5_do_add);
1432 
1433 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1434 		     int family, u8 prefixlen, int l3index,
1435 		     struct tcp_md5sig_key *key)
1436 {
1437 	struct tcp_sock *tp = tcp_sk(sk);
1438 
1439 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1440 		tcp_md5_add_sigpool();
1441 
1442 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1443 			tcp_md5_release_sigpool();
1444 			return -ENOMEM;
1445 		}
1446 
1447 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1448 			struct tcp_md5sig_info *md5sig;
1449 
1450 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1451 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1452 			rcu_assign_pointer(tp->md5sig_info, NULL);
1453 			kfree_rcu(md5sig, rcu);
1454 			tcp_md5_release_sigpool();
1455 			return -EUSERS;
1456 		}
1457 	}
1458 
1459 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1460 				key->flags, key->key, key->keylen,
1461 				sk_gfp_mask(sk, GFP_ATOMIC));
1462 }
1463 EXPORT_SYMBOL(tcp_md5_key_copy);
1464 
1465 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1466 		   u8 prefixlen, int l3index, u8 flags)
1467 {
1468 	struct tcp_md5sig_key *key;
1469 
1470 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1471 	if (!key)
1472 		return -ENOENT;
1473 	hlist_del_rcu(&key->node);
1474 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1475 	kfree_rcu(key, rcu);
1476 	return 0;
1477 }
1478 EXPORT_SYMBOL(tcp_md5_do_del);
1479 
1480 void tcp_clear_md5_list(struct sock *sk)
1481 {
1482 	struct tcp_sock *tp = tcp_sk(sk);
1483 	struct tcp_md5sig_key *key;
1484 	struct hlist_node *n;
1485 	struct tcp_md5sig_info *md5sig;
1486 
1487 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1488 
1489 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1490 		hlist_del_rcu(&key->node);
1491 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1492 		kfree_rcu(key, rcu);
1493 	}
1494 }
1495 
1496 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1497 				 sockptr_t optval, int optlen)
1498 {
1499 	struct tcp_md5sig cmd;
1500 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1501 	const union tcp_md5_addr *addr;
1502 	u8 prefixlen = 32;
1503 	int l3index = 0;
1504 	bool l3flag;
1505 	u8 flags;
1506 
1507 	if (optlen < sizeof(cmd))
1508 		return -EINVAL;
1509 
1510 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1511 		return -EFAULT;
1512 
1513 	if (sin->sin_family != AF_INET)
1514 		return -EINVAL;
1515 
1516 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1517 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1518 
1519 	if (optname == TCP_MD5SIG_EXT &&
1520 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1521 		prefixlen = cmd.tcpm_prefixlen;
1522 		if (prefixlen > 32)
1523 			return -EINVAL;
1524 	}
1525 
1526 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1527 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1528 		struct net_device *dev;
1529 
1530 		rcu_read_lock();
1531 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1532 		if (dev && netif_is_l3_master(dev))
1533 			l3index = dev->ifindex;
1534 
1535 		rcu_read_unlock();
1536 
1537 		/* ok to reference set/not set outside of rcu;
1538 		 * right now device MUST be an L3 master
1539 		 */
1540 		if (!dev || !l3index)
1541 			return -EINVAL;
1542 	}
1543 
1544 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1545 
1546 	if (!cmd.tcpm_keylen)
1547 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1548 
1549 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1550 		return -EINVAL;
1551 
1552 	/* Don't allow keys for peers that have a matching TCP-AO key.
1553 	 * See the comment in tcp_ao_add_cmd()
1554 	 */
1555 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1556 		return -EKEYREJECTED;
1557 
1558 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1559 			      cmd.tcpm_key, cmd.tcpm_keylen);
1560 }
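
/* A hedged userspace sketch of the setsockopt() handled by the parser
 * above (illustrative only; the peer address and key are placeholders,
 * and the prefix/ifindex extensions would use TCP_MD5SIG_EXT with the
 * corresponding tcpm_flags bits set):
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */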
1561 
1562 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1563 				   __be32 daddr, __be32 saddr,
1564 				   const struct tcphdr *th, int nbytes)
1565 {
1566 	struct tcp4_pseudohdr *bp;
1567 	struct scatterlist sg;
1568 	struct tcphdr *_th;
1569 
1570 	bp = hp->scratch;
1571 	bp->saddr = saddr;
1572 	bp->daddr = daddr;
1573 	bp->pad = 0;
1574 	bp->protocol = IPPROTO_TCP;
1575 	bp->len = cpu_to_be16(nbytes);
1576 
1577 	_th = (struct tcphdr *)(bp + 1);
1578 	memcpy(_th, th, sizeof(*th));
1579 	_th->check = 0;
1580 
1581 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1582 	ahash_request_set_crypt(hp->req, &sg, NULL,
1583 				sizeof(*bp) + sizeof(*th));
1584 	return crypto_ahash_update(hp->req);
1585 }
1586 
1587 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1588 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1589 {
1590 	struct tcp_sigpool hp;
1591 
1592 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1593 		goto clear_hash_nostart;
1594 
1595 	if (crypto_ahash_init(hp.req))
1596 		goto clear_hash;
1597 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1598 		goto clear_hash;
1599 	if (tcp_md5_hash_key(&hp, key))
1600 		goto clear_hash;
1601 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1602 	if (crypto_ahash_final(hp.req))
1603 		goto clear_hash;
1604 
1605 	tcp_sigpool_end(&hp);
1606 	return 0;
1607 
1608 clear_hash:
1609 	tcp_sigpool_end(&hp);
1610 clear_hash_nostart:
1611 	memset(md5_hash, 0, 16);
1612 	return 1;
1613 }
1614 
1615 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1616 			const struct sock *sk,
1617 			const struct sk_buff *skb)
1618 {
1619 	const struct tcphdr *th = tcp_hdr(skb);
1620 	struct tcp_sigpool hp;
1621 	__be32 saddr, daddr;
1622 
1623 	if (sk) { /* valid for establish/request sockets */
1624 		saddr = sk->sk_rcv_saddr;
1625 		daddr = sk->sk_daddr;
1626 	} else {
1627 		const struct iphdr *iph = ip_hdr(skb);
1628 		saddr = iph->saddr;
1629 		daddr = iph->daddr;
1630 	}
1631 
1632 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1633 		goto clear_hash_nostart;
1634 
1635 	if (crypto_ahash_init(hp.req))
1636 		goto clear_hash;
1637 
1638 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1639 		goto clear_hash;
1640 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1641 		goto clear_hash;
1642 	if (tcp_md5_hash_key(&hp, key))
1643 		goto clear_hash;
1644 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1645 	if (crypto_ahash_final(hp.req))
1646 		goto clear_hash;
1647 
1648 	tcp_sigpool_end(&hp);
1649 	return 0;
1650 
1651 clear_hash:
1652 	tcp_sigpool_end(&hp);
1653 clear_hash_nostart:
1654 	memset(md5_hash, 0, 16);
1655 	return 1;
1656 }
1657 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1658 
1659 #endif
1660 
1661 static void tcp_v4_init_req(struct request_sock *req,
1662 			    const struct sock *sk_listener,
1663 			    struct sk_buff *skb)
1664 {
1665 	struct inet_request_sock *ireq = inet_rsk(req);
1666 	struct net *net = sock_net(sk_listener);
1667 
1668 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1669 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1670 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1671 }
1672 
1673 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1674 					  struct sk_buff *skb,
1675 					  struct flowi *fl,
1676 					  struct request_sock *req)
1677 {
1678 	tcp_v4_init_req(req, sk, skb);
1679 
1680 	if (security_inet_conn_request(sk, skb, req))
1681 		return NULL;
1682 
1683 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1684 }
1685 
1686 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1687 	.family		=	PF_INET,
1688 	.obj_size	=	sizeof(struct tcp_request_sock),
1689 	.rtx_syn_ack	=	tcp_rtx_synack,
1690 	.send_ack	=	tcp_v4_reqsk_send_ack,
1691 	.destructor	=	tcp_v4_reqsk_destructor,
1692 	.send_reset	=	tcp_v4_send_reset,
1693 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1694 };
1695 
1696 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1697 	.mss_clamp	=	TCP_MSS_DEFAULT,
1698 #ifdef CONFIG_TCP_MD5SIG
1699 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1700 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1701 #endif
1702 #ifdef CONFIG_TCP_AO
1703 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1704 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1705 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1706 #endif
1707 #ifdef CONFIG_SYN_COOKIES
1708 	.cookie_init_seq =	cookie_v4_init_sequence,
1709 #endif
1710 	.route_req	=	tcp_v4_route_req,
1711 	.init_seq	=	tcp_v4_init_seq,
1712 	.init_ts_off	=	tcp_v4_init_ts_off,
1713 	.send_synack	=	tcp_v4_send_synack,
1714 };
1715 
1716 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1717 {
1718 	/* Never answer SYNs sent to broadcast or multicast */
1719 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1720 		goto drop;
1721 
1722 	return tcp_conn_request(&tcp_request_sock_ops,
1723 				&tcp_request_sock_ipv4_ops, sk, skb);
1724 
1725 drop:
1726 	tcp_listendrop(sk);
1727 	return 0;
1728 }
1729 EXPORT_SYMBOL(tcp_v4_conn_request);
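
/* How the passive-open path is exercised, as a hedged userspace sketch
 * (illustrative only; port and backlog are placeholders): the incoming
 * SYN on the listener is handled by tcp_v4_conn_request() above, the
 * ACK completing the handshake (or a valid syncookie ACK) creates the
 * child socket in tcp_v4_syn_recv_sock() below, and accept() dequeues
 * that child:
 *
 *	int lfd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in addr = { .sin_family = AF_INET,
 *				    .sin_port   = htons(8080) };
 *
 *	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(lfd, 128);
 *	int cfd = accept(lfd, NULL, NULL);
 */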
1730 
1731 
1732 /*
1733  * The three-way handshake has completed - we got a valid ACK to our SYN-ACK -
1734  * now create the new socket.
1735  */
1736 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1737 				  struct request_sock *req,
1738 				  struct dst_entry *dst,
1739 				  struct request_sock *req_unhash,
1740 				  bool *own_req)
1741 {
1742 	struct inet_request_sock *ireq;
1743 	bool found_dup_sk = false;
1744 	struct inet_sock *newinet;
1745 	struct tcp_sock *newtp;
1746 	struct sock *newsk;
1747 #ifdef CONFIG_TCP_MD5SIG
1748 	const union tcp_md5_addr *addr;
1749 	struct tcp_md5sig_key *key;
1750 	int l3index;
1751 #endif
1752 	struct ip_options_rcu *inet_opt;
1753 
1754 	if (sk_acceptq_is_full(sk))
1755 		goto exit_overflow;
1756 
1757 	newsk = tcp_create_openreq_child(sk, req, skb);
1758 	if (!newsk)
1759 		goto exit_nonewsk;
1760 
1761 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1762 	inet_sk_rx_dst_set(newsk, skb);
1763 
1764 	newtp		      = tcp_sk(newsk);
1765 	newinet		      = inet_sk(newsk);
1766 	ireq		      = inet_rsk(req);
1767 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1768 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1769 	newsk->sk_bound_dev_if = ireq->ir_iif;
1770 	newinet->inet_saddr   = ireq->ir_loc_addr;
1771 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1772 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1773 	newinet->mc_index     = inet_iif(skb);
1774 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1775 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1776 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1777 	if (inet_opt)
1778 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1779 	atomic_set(&newinet->inet_id, get_random_u16());
1780 
1781 	/* Set ToS of the new socket based upon the value of incoming SYN.
1782 	 * ECT bits are set later in tcp_init_transfer().
1783 	 */
1784 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1785 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1786 
1787 	if (!dst) {
1788 		dst = inet_csk_route_child_sock(sk, newsk, req);
1789 		if (!dst)
1790 			goto put_and_exit;
1791 	} else {
1792 		/* syncookie case : see end of cookie_v4_check() */
1793 	}
1794 	sk_setup_caps(newsk, dst);
1795 
1796 	tcp_ca_openreq_child(newsk, dst);
1797 
1798 	tcp_sync_mss(newsk, dst_mtu(dst));
1799 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1800 
1801 	tcp_initialize_rcv_mss(newsk);
1802 
1803 #ifdef CONFIG_TCP_MD5SIG
1804 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1805 	/* Copy over the MD5 key from the original socket */
1806 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1807 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1808 	if (key && !tcp_rsk_used_ao(req)) {
1809 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1810 			goto put_and_exit;
1811 		sk_gso_disable(newsk);
1812 	}
1813 #endif
1814 #ifdef CONFIG_TCP_AO
1815 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1816 		goto put_and_exit; /* OOM, release back memory */
1817 #endif
1818 
1819 	if (__inet_inherit_port(sk, newsk) < 0)
1820 		goto put_and_exit;
1821 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1822 				       &found_dup_sk);
1823 	if (likely(*own_req)) {
1824 		tcp_move_syn(newtp, req);
1825 		ireq->ireq_opt = NULL;
1826 	} else {
1827 		newinet->inet_opt = NULL;
1828 
1829 		if (!req_unhash && found_dup_sk) {
1830 			/* This code path should only be executed in the
1831 			 * syncookie case.
1832 			 */
1833 			bh_unlock_sock(newsk);
1834 			sock_put(newsk);
1835 			newsk = NULL;
1836 		}
1837 	}
1838 	return newsk;
1839 
1840 exit_overflow:
1841 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1842 exit_nonewsk:
1843 	dst_release(dst);
1844 exit:
1845 	tcp_listendrop(sk);
1846 	return NULL;
1847 put_and_exit:
1848 	newinet->inet_opt = NULL;
1849 	inet_csk_prepare_forced_close(newsk);
1850 	tcp_done(newsk);
1851 	goto exit;
1852 }
1853 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1854 
1855 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1856 {
1857 #ifdef CONFIG_SYN_COOKIES
1858 	const struct tcphdr *th = tcp_hdr(skb);
1859 
1860 	if (!th->syn)
1861 		sk = cookie_v4_check(sk, skb);
1862 #endif
1863 	return sk;
1864 }
1865 
1866 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1867 			 struct tcphdr *th, u32 *cookie)
1868 {
1869 	u16 mss = 0;
1870 #ifdef CONFIG_SYN_COOKIES
1871 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1872 				    &tcp_request_sock_ipv4_ops, sk, th);
1873 	if (mss) {
1874 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1875 		tcp_synq_overflow(sk);
1876 	}
1877 #endif
1878 	return mss;
1879 }
1880 
1881 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1882 							   u32));
1883 /* The socket must have its spinlock held when we get
1884  * here, unless it is a TCP_LISTEN socket.
1885  *
1886  * We have a potential double-lock case here, so even when
1887  * doing backlog processing we use the BH locking scheme.
1888  * This is because we cannot sleep with the original spinlock
1889  * held.
1890  */
1891 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1892 {
1893 	enum skb_drop_reason reason;
1894 	struct sock *rsk;
1895 
1896 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1897 		struct dst_entry *dst;
1898 
1899 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1900 						lockdep_sock_is_held(sk));
1901 
1902 		sock_rps_save_rxhash(sk, skb);
1903 		sk_mark_napi_id(sk, skb);
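		/* Validate the cached input route: if the segment arrived on a
		 * different interface, or the cached dst is no longer valid,
		 * drop it so a fresh one can be installed later.
		 */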
1904 		if (dst) {
1905 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1906 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1907 					     dst, 0)) {
1908 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1909 				dst_release(dst);
1910 			}
1911 		}
1912 		tcp_rcv_established(sk, skb);
1913 		return 0;
1914 	}
1915 
1916 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1917 	if (tcp_checksum_complete(skb))
1918 		goto csum_err;
1919 
1920 	if (sk->sk_state == TCP_LISTEN) {
1921 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1922 
1923 		if (!nsk)
1924 			goto discard;
1925 		if (nsk != sk) {
1926 			if (tcp_child_process(sk, nsk, skb)) {
1927 				rsk = nsk;
1928 				goto reset;
1929 			}
1930 			return 0;
1931 		}
1932 	} else
1933 		sock_rps_save_rxhash(sk, skb);
1934 
1935 	if (tcp_rcv_state_process(sk, skb)) {
1936 		rsk = sk;
1937 		goto reset;
1938 	}
1939 	return 0;
1940 
1941 reset:
1942 	tcp_v4_send_reset(rsk, skb);
1943 discard:
1944 	kfree_skb_reason(skb, reason);
1945 	/* Be careful here. If this function gets more complicated and
1946 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1947 	 * might be destroyed here. This current version compiles correctly,
1948 	 * but you have been warned.
1949 	 */
1950 	return 0;
1951 
1952 csum_err:
1953 	reason = SKB_DROP_REASON_TCP_CSUM;
1954 	trace_tcp_bad_csum(skb);
1955 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1956 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1957 	goto discard;
1958 }
1959 EXPORT_SYMBOL(tcp_v4_do_rcv);
1960 
1961 int tcp_v4_early_demux(struct sk_buff *skb)
1962 {
1963 	struct net *net = dev_net(skb->dev);
1964 	const struct iphdr *iph;
1965 	const struct tcphdr *th;
1966 	struct sock *sk;
1967 
1968 	if (skb->pkt_type != PACKET_HOST)
1969 		return 0;
1970 
1971 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1972 		return 0;
1973 
1974 	iph = ip_hdr(skb);
1975 	th = tcp_hdr(skb);
1976 
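	/* A data offset smaller than the minimal TCP header (5 32-bit words)
	 * cannot be a valid TCP segment.
	 */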
1977 	if (th->doff < sizeof(struct tcphdr) / 4)
1978 		return 0;
1979 
1980 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1981 				       iph->saddr, th->source,
1982 				       iph->daddr, ntohs(th->dest),
1983 				       skb->skb_iif, inet_sdif(skb));
1984 	if (sk) {
1985 		skb->sk = sk;
1986 		skb->destructor = sock_edemux;
1987 		if (sk_fullsock(sk)) {
1988 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1989 
1990 			if (dst)
1991 				dst = dst_check(dst, 0);
1992 			if (dst &&
1993 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1994 				skb_dst_set_noref(skb, dst);
1995 		}
1996 	}
1997 	return 0;
1998 }
1999 
2000 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2001 		     enum skb_drop_reason *reason)
2002 {
2003 	u32 limit, tail_gso_size, tail_gso_segs;
2004 	struct skb_shared_info *shinfo;
2005 	const struct tcphdr *th;
2006 	struct tcphdr *thtail;
2007 	struct sk_buff *tail;
2008 	unsigned int hdrlen;
2009 	bool fragstolen;
2010 	u32 gso_segs;
2011 	u32 gso_size;
2012 	int delta;
2013 
2014 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2015 	 * we can fix skb->truesize to its real value to avoid future drops.
2016 	 * This is valid because skb is not yet charged to the socket.
2017 	 * It has been noticed that pure SACK packets were sometimes dropped
2018 	 * (if cooked by drivers without copybreak feature).
2019 	 */
2020 	skb_condense(skb);
2021 
2022 	skb_dst_drop(skb);
2023 
2024 	if (unlikely(tcp_checksum_complete(skb))) {
2025 		bh_unlock_sock(sk);
2026 		trace_tcp_bad_csum(skb);
2027 		*reason = SKB_DROP_REASON_TCP_CSUM;
2028 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2029 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2030 		return true;
2031 	}
2032 
2033 	/* Attempt coalescing to last skb in backlog, even if we are
2034 	 * above the limits.
2035 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2036 	 */
2037 	th = (const struct tcphdr *)skb->data;
2038 	hdrlen = th->doff * 4;
2039 
2040 	tail = sk->sk_backlog.tail;
2041 	if (!tail)
2042 		goto no_coalesce;
2043 	thtail = (struct tcphdr *)tail->data;
2044 
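	/* Coalescing requires, among other things, that the new segment is
	 * exactly contiguous with the backlog tail, carries the same DSCP/ECN
	 * byte and the same ECE/CWR flags, has no SYN/RST/URG flag while both
	 * segments have ACK set, and carries an identical TCP option block.
	 */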
2045 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2046 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2047 	    ((TCP_SKB_CB(tail)->tcp_flags |
2048 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2049 	    !((TCP_SKB_CB(tail)->tcp_flags &
2050 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2051 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2052 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2053 #ifdef CONFIG_TLS_DEVICE
2054 	    tail->decrypted != skb->decrypted ||
2055 #endif
2056 	    !mptcp_skb_can_collapse(tail, skb) ||
2057 	    thtail->doff != th->doff ||
2058 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2059 		goto no_coalesce;
2060 
2061 	__skb_pull(skb, hdrlen);
2062 
2063 	shinfo = skb_shinfo(skb);
2064 	gso_size = shinfo->gso_size ?: skb->len;
2065 	gso_segs = shinfo->gso_segs ?: 1;
2066 
2067 	shinfo = skb_shinfo(tail);
2068 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2069 	tail_gso_segs = shinfo->gso_segs ?: 1;
2070 
2071 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2072 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2073 
2074 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2075 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2076 			thtail->window = th->window;
2077 		}
2078 
2079 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2080 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2081 		 * is not entered if we append a packet with a FIN.
2082 		 * SYN, RST, URG are not present.
2083 		 * ACK is set on both packets.
2084 		 * PSH : we do not really care in TCP stack,
2085 		 *       at least for 'GRO' packets.
2086 		 */
2087 		thtail->fin |= th->fin;
2088 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2089 
2090 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2091 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2092 			tail->tstamp = skb->tstamp;
2093 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2094 		}
2095 
2096 		/* Not as strict as GRO. We only need to carry mss max value */
2097 		shinfo->gso_size = max(gso_size, tail_gso_size);
2098 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2099 
2100 		sk->sk_backlog.len += delta;
2101 		__NET_INC_STATS(sock_net(sk),
2102 				LINUX_MIB_TCPBACKLOGCOALESCE);
2103 		kfree_skb_partial(skb, fragstolen);
2104 		return false;
2105 	}
2106 	__skb_push(skb, hdrlen);
2107 
2108 no_coalesce:
2109 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
2110 
2111 	/* Only the socket owner can try to collapse/prune the rx queues
2112 	 * to reduce memory overhead, so add a little headroom here.
2113 	 * Only a few socket backlogs are likely to be non-empty concurrently.
2114 	 */
2115 	limit += 64 * 1024;
2116 
2117 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2118 		bh_unlock_sock(sk);
2119 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2120 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2121 		return true;
2122 	}
2123 	return false;
2124 }
2125 EXPORT_SYMBOL(tcp_add_backlog);
2126 
2127 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2128 {
2129 	struct tcphdr *th = (struct tcphdr *)skb->data;
2130 
2131 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2132 }
2133 EXPORT_SYMBOL(tcp_filter);
2134 
2135 static void tcp_v4_restore_cb(struct sk_buff *skb)
2136 {
2137 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2138 		sizeof(struct inet_skb_parm));
2139 }
2140 
2141 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2142 			   const struct tcphdr *th)
2143 {
2144 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2145 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2146 	 */
2147 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2148 		sizeof(struct inet_skb_parm));
2149 	barrier();
2150 
2151 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
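	/* end_seq accounts for SYN and FIN as well, since each of those flags
	 * consumes one unit of sequence space on top of the payload.
	 */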
2152 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2153 				    skb->len - th->doff * 4);
2154 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2155 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2156 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
2157 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2158 	TCP_SKB_CB(skb)->sacked	 = 0;
2159 	TCP_SKB_CB(skb)->has_rxtstamp =
2160 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2161 }
2162 
2163 /*
2164  *	From tcp_input.c
2165  */
2166 
2167 int tcp_v4_rcv(struct sk_buff *skb)
2168 {
2169 	struct net *net = dev_net(skb->dev);
2170 	enum skb_drop_reason drop_reason;
2171 	int sdif = inet_sdif(skb);
2172 	int dif = inet_iif(skb);
2173 	const struct iphdr *iph;
2174 	const struct tcphdr *th;
2175 	bool refcounted;
2176 	struct sock *sk;
2177 	int ret;
2178 
2179 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2180 	if (skb->pkt_type != PACKET_HOST)
2181 		goto discard_it;
2182 
2183 	/* Count it even if it's bad */
2184 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2185 
2186 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2187 		goto discard_it;
2188 
2189 	th = (const struct tcphdr *)skb->data;
2190 
2191 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2192 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2193 		goto bad_packet;
2194 	}
2195 	if (!pskb_may_pull(skb, th->doff * 4))
2196 		goto discard_it;
2197 
2198 	/* An explanation is required here, I think.
2199 	 * Packet length and doff are validated by header prediction,
2200 	 * provided the case of th->doff == 0 is eliminated.
2201 	 * So, we defer the checks. */
2202 
2203 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2204 		goto csum_error;
2205 
2206 	th = (const struct tcphdr *)skb->data;
2207 	iph = ip_hdr(skb);
2208 lookup:
2209 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2210 			       skb, __tcp_hdrlen(th), th->source,
2211 			       th->dest, sdif, &refcounted);
2212 	if (!sk)
2213 		goto no_tcp_socket;
2214 
2215 process:
2216 	if (sk->sk_state == TCP_TIME_WAIT)
2217 		goto do_time_wait;
2218 
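	/* A TCP_NEW_SYN_RECV "socket" is really a request_sock found in the
	 * ehash.  The segment is validated against the listener (XFRM policy,
	 * MD5/AO, checksum), then tcp_check_req() decides whether to drop it,
	 * create the child socket, or fall back to normal listener processing.
	 */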
2219 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2220 		struct request_sock *req = inet_reqsk(sk);
2221 		bool req_stolen = false;
2222 		struct sock *nsk;
2223 
2224 		sk = req->rsk_listener;
2225 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2226 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2227 		else
2228 			drop_reason = tcp_inbound_hash(sk, req, skb,
2229 						       &iph->saddr, &iph->daddr,
2230 						       AF_INET, dif, sdif);
2231 		if (unlikely(drop_reason)) {
2232 			sk_drops_add(sk, skb);
2233 			reqsk_put(req);
2234 			goto discard_it;
2235 		}
2236 		if (tcp_checksum_complete(skb)) {
2237 			reqsk_put(req);
2238 			goto csum_error;
2239 		}
2240 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2241 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2242 			if (!nsk) {
2243 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2244 				goto lookup;
2245 			}
2246 			sk = nsk;
2247 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2248 			 * before returning.
2249 			 */
2250 		} else {
2251 			/* We own a reference on the listener, increase it again
2252 			 * as we might lose it too soon.
2253 			 */
2254 			sock_hold(sk);
2255 		}
2256 		refcounted = true;
2257 		nsk = NULL;
2258 		if (!tcp_filter(sk, skb)) {
2259 			th = (const struct tcphdr *)skb->data;
2260 			iph = ip_hdr(skb);
2261 			tcp_v4_fill_cb(skb, iph, th);
2262 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2263 		} else {
2264 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2265 		}
2266 		if (!nsk) {
2267 			reqsk_put(req);
2268 			if (req_stolen) {
2269 				/* Another cpu got exclusive access to req
2270 				 * and created a full-blown socket.
2271 				 * Try to feed this packet to this socket
2272 				 * instead of discarding it.
2273 				 */
2274 				tcp_v4_restore_cb(skb);
2275 				sock_put(sk);
2276 				goto lookup;
2277 			}
2278 			goto discard_and_relse;
2279 		}
2280 		nf_reset_ct(skb);
2281 		if (nsk == sk) {
2282 			reqsk_put(req);
2283 			tcp_v4_restore_cb(skb);
2284 		} else if (tcp_child_process(sk, nsk, skb)) {
2285 			tcp_v4_send_reset(nsk, skb);
2286 			goto discard_and_relse;
2287 		} else {
2288 			sock_put(sk);
2289 			return 0;
2290 		}
2291 	}
2292 
2293 	if (static_branch_unlikely(&ip4_min_ttl)) {
2294 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2295 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2296 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2297 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2298 			goto discard_and_relse;
2299 		}
2300 	}
2301 
2302 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2303 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2304 		goto discard_and_relse;
2305 	}
2306 
2307 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2308 				       AF_INET, dif, sdif);
2309 	if (drop_reason)
2310 		goto discard_and_relse;
2311 
2312 	nf_reset_ct(skb);
2313 
2314 	if (tcp_filter(sk, skb)) {
2315 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2316 		goto discard_and_relse;
2317 	}
2318 	th = (const struct tcphdr *)skb->data;
2319 	iph = ip_hdr(skb);
2320 	tcp_v4_fill_cb(skb, iph, th);
2321 
2322 	skb->dev = NULL;
2323 
2324 	if (sk->sk_state == TCP_LISTEN) {
2325 		ret = tcp_v4_do_rcv(sk, skb);
2326 		goto put_and_return;
2327 	}
2328 
2329 	sk_incoming_cpu_update(sk);
2330 
2331 	bh_lock_sock_nested(sk);
2332 	tcp_segs_in(tcp_sk(sk), skb);
2333 	ret = 0;
2334 	if (!sock_owned_by_user(sk)) {
2335 		ret = tcp_v4_do_rcv(sk, skb);
2336 	} else {
2337 		if (tcp_add_backlog(sk, skb, &drop_reason))
2338 			goto discard_and_relse;
2339 	}
2340 	bh_unlock_sock(sk);
2341 
2342 put_and_return:
2343 	if (refcounted)
2344 		sock_put(sk);
2345 
2346 	return ret;
2347 
2348 no_tcp_socket:
2349 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2350 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2351 		goto discard_it;
2352 
2353 	tcp_v4_fill_cb(skb, iph, th);
2354 
2355 	if (tcp_checksum_complete(skb)) {
2356 csum_error:
2357 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2358 		trace_tcp_bad_csum(skb);
2359 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2360 bad_packet:
2361 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2362 	} else {
2363 		tcp_v4_send_reset(NULL, skb);
2364 	}
2365 
2366 discard_it:
2367 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2368 	/* Discard frame. */
2369 	kfree_skb_reason(skb, drop_reason);
2370 	return 0;
2371 
2372 discard_and_relse:
2373 	sk_drops_add(sk, skb);
2374 	if (refcounted)
2375 		sock_put(sk);
2376 	goto discard_it;
2377 
2378 do_time_wait:
2379 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2380 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2381 		inet_twsk_put(inet_twsk(sk));
2382 		goto discard_it;
2383 	}
2384 
2385 	tcp_v4_fill_cb(skb, iph, th);
2386 
2387 	if (tcp_checksum_complete(skb)) {
2388 		inet_twsk_put(inet_twsk(sk));
2389 		goto csum_error;
2390 	}
2391 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2392 	case TCP_TW_SYN: {
2393 		struct sock *sk2 = inet_lookup_listener(net,
2394 							net->ipv4.tcp_death_row.hashinfo,
2395 							skb, __tcp_hdrlen(th),
2396 							iph->saddr, th->source,
2397 							iph->daddr, th->dest,
2398 							inet_iif(skb),
2399 							sdif);
2400 		if (sk2) {
2401 			inet_twsk_deschedule_put(inet_twsk(sk));
2402 			sk = sk2;
2403 			tcp_v4_restore_cb(skb);
2404 			refcounted = false;
2405 			goto process;
2406 		}
2407 	}
2408 		/* to ACK */
2409 		fallthrough;
2410 	case TCP_TW_ACK:
2411 		tcp_v4_timewait_ack(sk, skb);
2412 		break;
2413 	case TCP_TW_RST:
2414 		tcp_v4_send_reset(sk, skb);
2415 		inet_twsk_deschedule_put(inet_twsk(sk));
2416 		goto discard_it;
2417 	case TCP_TW_SUCCESS:;
2418 	}
2419 	goto discard_it;
2420 }
2421 
2422 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2423 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2424 	.twsk_unique	= tcp_twsk_unique,
2425 	.twsk_destructor= tcp_twsk_destructor,
2426 };
2427 
2428 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2429 {
2430 	struct dst_entry *dst = skb_dst(skb);
2431 
2432 	if (dst && dst_hold_safe(dst)) {
2433 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2434 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2435 	}
2436 }
2437 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2438 
2439 const struct inet_connection_sock_af_ops ipv4_specific = {
2440 	.queue_xmit	   = ip_queue_xmit,
2441 	.send_check	   = tcp_v4_send_check,
2442 	.rebuild_header	   = inet_sk_rebuild_header,
2443 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2444 	.conn_request	   = tcp_v4_conn_request,
2445 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2446 	.net_header_len	   = sizeof(struct iphdr),
2447 	.setsockopt	   = ip_setsockopt,
2448 	.getsockopt	   = ip_getsockopt,
2449 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2450 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2451 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2452 };
2453 EXPORT_SYMBOL(ipv4_specific);
2454 
2455 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2456 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2457 #ifdef CONFIG_TCP_MD5SIG
2458 	.md5_lookup		= tcp_v4_md5_lookup,
2459 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2460 	.md5_parse		= tcp_v4_parse_md5_keys,
2461 #endif
2462 #ifdef CONFIG_TCP_AO
2463 	.ao_lookup		= tcp_v4_ao_lookup,
2464 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2465 	.ao_parse		= tcp_v4_parse_ao,
2466 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2467 #endif
2468 };
2469 #endif
2470 
2471 /* NOTE: A lot of things are set to zero explicitly by the call to
2472  *       sk_alloc(), so they need not be done here.
2473  */
2474 static int tcp_v4_init_sock(struct sock *sk)
2475 {
2476 	struct inet_connection_sock *icsk = inet_csk(sk);
2477 
2478 	tcp_init_sock(sk);
2479 
2480 	icsk->icsk_af_ops = &ipv4_specific;
2481 
2482 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2483 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2484 #endif
2485 
2486 	return 0;
2487 }
2488 
2489 #ifdef CONFIG_TCP_MD5SIG
2490 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2491 {
2492 	struct tcp_md5sig_info *md5sig;
2493 
2494 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2495 	kfree(md5sig);
2496 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2497 	tcp_md5_release_sigpool();
2498 }
2499 #endif
2500 
2501 void tcp_v4_destroy_sock(struct sock *sk)
2502 {
2503 	struct tcp_sock *tp = tcp_sk(sk);
2504 
2505 	trace_tcp_destroy_sock(sk);
2506 
2507 	tcp_clear_xmit_timers(sk);
2508 
2509 	tcp_cleanup_congestion_control(sk);
2510 
2511 	tcp_cleanup_ulp(sk);
2512 
2513 	/* Clean up the write buffer. */
2514 	tcp_write_queue_purge(sk);
2515 
2516 	/* Check if we want to disable active TFO */
2517 	tcp_fastopen_active_disable_ofo_check(sk);
2518 
2519 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2520 	skb_rbtree_purge(&tp->out_of_order_queue);
2521 
2522 #ifdef CONFIG_TCP_MD5SIG
2523 	/* Clean up the MD5 key list, if any */
2524 	if (tp->md5sig_info) {
2525 		struct tcp_md5sig_info *md5sig;
2526 
2527 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2528 		tcp_clear_md5_list(sk);
2529 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2530 		rcu_assign_pointer(tp->md5sig_info, NULL);
2531 	}
2532 #endif
2533 	tcp_ao_destroy_sock(sk, false);
2534 
2535 	/* Clean up a referenced TCP bind bucket. */
2536 	if (inet_csk(sk)->icsk_bind_hash)
2537 		inet_put_port(sk);
2538 
2539 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2540 
2541 	/* If socket is aborted during connect operation */
2542 	tcp_free_fastopen_req(tp);
2543 	tcp_fastopen_destroy_cipher(sk);
2544 	tcp_saved_syn_free(tp);
2545 
2546 	sk_sockets_allocated_dec(sk);
2547 }
2548 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2549 
2550 #ifdef CONFIG_PROC_FS
2551 /* Proc filesystem TCP sock list dumping. */
2552 
2553 static unsigned short seq_file_family(const struct seq_file *seq);
2554 
2555 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2556 {
2557 	unsigned short family = seq_file_family(seq);
2558 
2559 	/* AF_UNSPEC is used as a match all */
2560 	/* AF_UNSPEC is used as a match-all */
2561 		net_eq(sock_net(sk), seq_file_net(seq)));
2562 }
2563 
2564 /* Find a non-empty bucket (starting from st->bucket)
2565  * and return the first sk from it.
2566  */
2567 static void *listening_get_first(struct seq_file *seq)
2568 {
2569 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2570 	struct tcp_iter_state *st = seq->private;
2571 
2572 	st->offset = 0;
2573 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2574 		struct inet_listen_hashbucket *ilb2;
2575 		struct hlist_nulls_node *node;
2576 		struct sock *sk;
2577 
2578 		ilb2 = &hinfo->lhash2[st->bucket];
2579 		if (hlist_nulls_empty(&ilb2->nulls_head))
2580 			continue;
2581 
2582 		spin_lock(&ilb2->lock);
2583 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2584 			if (seq_sk_match(seq, sk))
2585 				return sk;
2586 		}
2587 		spin_unlock(&ilb2->lock);
2588 	}
2589 
2590 	return NULL;
2591 }
2592 
2593 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2594  * If "cur" is the last one in st->bucket,
2595  * call listening_get_first() to return the first sk of the next
2596  * non-empty bucket.
2597  */
2598 static void *listening_get_next(struct seq_file *seq, void *cur)
2599 {
2600 	struct tcp_iter_state *st = seq->private;
2601 	struct inet_listen_hashbucket *ilb2;
2602 	struct hlist_nulls_node *node;
2603 	struct inet_hashinfo *hinfo;
2604 	struct sock *sk = cur;
2605 
2606 	++st->num;
2607 	++st->offset;
2608 
2609 	sk = sk_nulls_next(sk);
2610 	sk_nulls_for_each_from(sk, node) {
2611 		if (seq_sk_match(seq, sk))
2612 			return sk;
2613 	}
2614 
2615 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2616 	ilb2 = &hinfo->lhash2[st->bucket];
2617 	spin_unlock(&ilb2->lock);
2618 	++st->bucket;
2619 	return listening_get_first(seq);
2620 }
2621 
2622 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2623 {
2624 	struct tcp_iter_state *st = seq->private;
2625 	void *rc;
2626 
2627 	st->bucket = 0;
2628 	st->offset = 0;
2629 	rc = listening_get_first(seq);
2630 
2631 	while (rc && *pos) {
2632 		rc = listening_get_next(seq, rc);
2633 		--*pos;
2634 	}
2635 	return rc;
2636 }
2637 
2638 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2639 				const struct tcp_iter_state *st)
2640 {
2641 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2642 }
2643 
2644 /*
2645  * Get first established socket starting from bucket given in st->bucket.
2646  * If st->bucket is zero, the very first socket in the hash is returned.
2647  */
2648 static void *established_get_first(struct seq_file *seq)
2649 {
2650 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2651 	struct tcp_iter_state *st = seq->private;
2652 
2653 	st->offset = 0;
2654 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2655 		struct sock *sk;
2656 		struct hlist_nulls_node *node;
2657 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2658 
2659 		cond_resched();
2660 
2661 		/* Lockless fast path for the common case of empty buckets */
2662 		if (empty_bucket(hinfo, st))
2663 			continue;
2664 
2665 		spin_lock_bh(lock);
2666 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2667 			if (seq_sk_match(seq, sk))
2668 				return sk;
2669 		}
2670 		spin_unlock_bh(lock);
2671 	}
2672 
2673 	return NULL;
2674 }
2675 
2676 static void *established_get_next(struct seq_file *seq, void *cur)
2677 {
2678 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2679 	struct tcp_iter_state *st = seq->private;
2680 	struct hlist_nulls_node *node;
2681 	struct sock *sk = cur;
2682 
2683 	++st->num;
2684 	++st->offset;
2685 
2686 	sk = sk_nulls_next(sk);
2687 
2688 	sk_nulls_for_each_from(sk, node) {
2689 		if (seq_sk_match(seq, sk))
2690 			return sk;
2691 	}
2692 
2693 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2694 	++st->bucket;
2695 	return established_get_first(seq);
2696 }
2697 
2698 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2699 {
2700 	struct tcp_iter_state *st = seq->private;
2701 	void *rc;
2702 
2703 	st->bucket = 0;
2704 	rc = established_get_first(seq);
2705 
2706 	while (rc && pos) {
2707 		rc = established_get_next(seq, rc);
2708 		--pos;
2709 	}
2710 	return rc;
2711 }
2712 
2713 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2714 {
2715 	void *rc;
2716 	struct tcp_iter_state *st = seq->private;
2717 
2718 	st->state = TCP_SEQ_STATE_LISTENING;
2719 	rc	  = listening_get_idx(seq, &pos);
2720 
2721 	if (!rc) {
2722 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2723 		rc	  = established_get_idx(seq, pos);
2724 	}
2725 
2726 	return rc;
2727 }
2728 
2729 static void *tcp_seek_last_pos(struct seq_file *seq)
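/* Resume iteration at the bucket/offset recorded by the previous pass, so
 * that a sequential read of /proc/net/tcp does not have to rescan every
 * earlier bucket on each read() call.
 */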
2730 {
2731 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2732 	struct tcp_iter_state *st = seq->private;
2733 	int bucket = st->bucket;
2734 	int offset = st->offset;
2735 	int orig_num = st->num;
2736 	void *rc = NULL;
2737 
2738 	switch (st->state) {
2739 	case TCP_SEQ_STATE_LISTENING:
2740 		if (st->bucket > hinfo->lhash2_mask)
2741 			break;
2742 		rc = listening_get_first(seq);
2743 		while (offset-- && rc && bucket == st->bucket)
2744 			rc = listening_get_next(seq, rc);
2745 		if (rc)
2746 			break;
2747 		st->bucket = 0;
2748 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2749 		fallthrough;
2750 	case TCP_SEQ_STATE_ESTABLISHED:
2751 		if (st->bucket > hinfo->ehash_mask)
2752 			break;
2753 		rc = established_get_first(seq);
2754 		while (offset-- && rc && bucket == st->bucket)
2755 			rc = established_get_next(seq, rc);
2756 	}
2757 
2758 	st->num = orig_num;
2759 
2760 	return rc;
2761 }
2762 
2763 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2764 {
2765 	struct tcp_iter_state *st = seq->private;
2766 	void *rc;
2767 
2768 	if (*pos && *pos == st->last_pos) {
2769 		rc = tcp_seek_last_pos(seq);
2770 		if (rc)
2771 			goto out;
2772 	}
2773 
2774 	st->state = TCP_SEQ_STATE_LISTENING;
2775 	st->num = 0;
2776 	st->bucket = 0;
2777 	st->offset = 0;
2778 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2779 
2780 out:
2781 	st->last_pos = *pos;
2782 	return rc;
2783 }
2784 EXPORT_SYMBOL(tcp_seq_start);
2785 
2786 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2787 {
2788 	struct tcp_iter_state *st = seq->private;
2789 	void *rc = NULL;
2790 
2791 	if (v == SEQ_START_TOKEN) {
2792 		rc = tcp_get_idx(seq, 0);
2793 		goto out;
2794 	}
2795 
2796 	switch (st->state) {
2797 	case TCP_SEQ_STATE_LISTENING:
2798 		rc = listening_get_next(seq, v);
2799 		if (!rc) {
2800 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2801 			st->bucket = 0;
2802 			st->offset = 0;
2803 			rc	  = established_get_first(seq);
2804 		}
2805 		break;
2806 	case TCP_SEQ_STATE_ESTABLISHED:
2807 		rc = established_get_next(seq, v);
2808 		break;
2809 	}
2810 out:
2811 	++*pos;
2812 	st->last_pos = *pos;
2813 	return rc;
2814 }
2815 EXPORT_SYMBOL(tcp_seq_next);
2816 
2817 void tcp_seq_stop(struct seq_file *seq, void *v)
2818 {
2819 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2820 	struct tcp_iter_state *st = seq->private;
2821 
2822 	switch (st->state) {
2823 	case TCP_SEQ_STATE_LISTENING:
2824 		if (v != SEQ_START_TOKEN)
2825 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2826 		break;
2827 	case TCP_SEQ_STATE_ESTABLISHED:
2828 		if (v)
2829 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2830 		break;
2831 	}
2832 }
2833 EXPORT_SYMBOL(tcp_seq_stop);
2834 
2835 static void get_openreq4(const struct request_sock *req,
2836 			 struct seq_file *f, int i)
2837 {
2838 	const struct inet_request_sock *ireq = inet_rsk(req);
2839 	long delta = req->rsk_timer.expires - jiffies;
2840 
2841 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2842 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2843 		i,
2844 		ireq->ir_loc_addr,
2845 		ireq->ir_num,
2846 		ireq->ir_rmt_addr,
2847 		ntohs(ireq->ir_rmt_port),
2848 		TCP_SYN_RECV,
2849 		0, 0, /* could print option size, but that is af dependent. */
2850 		1,    /* timers active (only the expire timer) */
2851 		jiffies_delta_to_clock_t(delta),
2852 		req->num_timeout,
2853 		from_kuid_munged(seq_user_ns(f),
2854 				 sock_i_uid(req->rsk_listener)),
2855 		0,  /* non standard timer */
2856 		0, /* open_requests have no inode */
2857 		0,
2858 		req);
2859 }
2860 
2861 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2862 {
2863 	int timer_active;
2864 	unsigned long timer_expires;
2865 	const struct tcp_sock *tp = tcp_sk(sk);
2866 	const struct inet_connection_sock *icsk = inet_csk(sk);
2867 	const struct inet_sock *inet = inet_sk(sk);
2868 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2869 	__be32 dest = inet->inet_daddr;
2870 	__be32 src = inet->inet_rcv_saddr;
2871 	__u16 destp = ntohs(inet->inet_dport);
2872 	__u16 srcp = ntohs(inet->inet_sport);
2873 	int rx_queue;
2874 	int state;
2875 
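	/* timer_active encodes which timer is reported in /proc/net/tcp:
	 * 1 = retransmit/loss-probe, 2 = keepalive, 4 = zero window probe,
	 * 0 = no timer pending (3 is used for TIME_WAIT sockets below).
	 */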
2876 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2877 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2878 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2879 		timer_active	= 1;
2880 		timer_expires	= icsk->icsk_timeout;
2881 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2882 		timer_active	= 4;
2883 		timer_expires	= icsk->icsk_timeout;
2884 	} else if (timer_pending(&sk->sk_timer)) {
2885 		timer_active	= 2;
2886 		timer_expires	= sk->sk_timer.expires;
2887 	} else {
2888 		timer_active	= 0;
2889 		timer_expires = jiffies;
2890 	}
2891 
2892 	state = inet_sk_state_load(sk);
2893 	if (state == TCP_LISTEN)
2894 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2895 	else
2896 		/* Because we don't lock the socket,
2897 		 * we might find a transient negative value.
2898 		 */
2899 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2900 				      READ_ONCE(tp->copied_seq), 0);
2901 
2902 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2903 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2904 		i, src, srcp, dest, destp, state,
2905 		READ_ONCE(tp->write_seq) - tp->snd_una,
2906 		rx_queue,
2907 		timer_active,
2908 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2909 		icsk->icsk_retransmits,
2910 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2911 		icsk->icsk_probes_out,
2912 		sock_i_ino(sk),
2913 		refcount_read(&sk->sk_refcnt), sk,
2914 		jiffies_to_clock_t(icsk->icsk_rto),
2915 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2916 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2917 		tcp_snd_cwnd(tp),
2918 		state == TCP_LISTEN ?
2919 		    fastopenq->max_qlen :
2920 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2921 }
2922 
2923 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2924 			       struct seq_file *f, int i)
2925 {
2926 	long delta = tw->tw_timer.expires - jiffies;
2927 	__be32 dest, src;
2928 	__u16 destp, srcp;
2929 
2930 	dest  = tw->tw_daddr;
2931 	src   = tw->tw_rcv_saddr;
2932 	destp = ntohs(tw->tw_dport);
2933 	srcp  = ntohs(tw->tw_sport);
2934 
2935 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2936 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2937 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2938 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2939 		refcount_read(&tw->tw_refcnt), tw);
2940 }
2941 
2942 #define TMPSZ 150
2943 
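/* Each /proc/net/tcp record is padded to TMPSZ - 1 characters plus a
 * newline, so userspace readers can rely on fixed-size lines.
 */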
2944 static int tcp4_seq_show(struct seq_file *seq, void *v)
2945 {
2946 	struct tcp_iter_state *st;
2947 	struct sock *sk = v;
2948 
2949 	seq_setwidth(seq, TMPSZ - 1);
2950 	if (v == SEQ_START_TOKEN) {
2951 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2952 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2953 			   "inode");
2954 		goto out;
2955 	}
2956 	st = seq->private;
2957 
2958 	if (sk->sk_state == TCP_TIME_WAIT)
2959 		get_timewait4_sock(v, seq, st->num);
2960 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2961 		get_openreq4(v, seq, st->num);
2962 	else
2963 		get_tcp4_sock(v, seq, st->num);
2964 out:
2965 	seq_pad(seq, '\n');
2966 	return 0;
2967 }
2968 
2969 #ifdef CONFIG_BPF_SYSCALL
2970 struct bpf_tcp_iter_state {
2971 	struct tcp_iter_state state;
2972 	unsigned int cur_sk;
2973 	unsigned int end_sk;
2974 	unsigned int max_sk;
2975 	struct sock **batch;
2976 	bool st_bucket_done;
2977 };
2978 
2979 struct bpf_iter__tcp {
2980 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2981 	__bpf_md_ptr(struct sock_common *, sk_common);
2982 	uid_t uid __aligned(8);
2983 };
2984 
2985 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2986 			     struct sock_common *sk_common, uid_t uid)
2987 {
2988 	struct bpf_iter__tcp ctx;
2989 
2990 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2991 	ctx.meta = meta;
2992 	ctx.sk_common = sk_common;
2993 	ctx.uid = uid;
2994 	return bpf_iter_run_prog(prog, &ctx);
2995 }
2996 
2997 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2998 {
2999 	while (iter->cur_sk < iter->end_sk)
3000 		sock_gen_put(iter->batch[iter->cur_sk++]);
3001 }
3002 
3003 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3004 				      unsigned int new_batch_sz)
3005 {
3006 	struct sock **new_batch;
3007 
3008 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3009 			     GFP_USER | __GFP_NOWARN);
3010 	if (!new_batch)
3011 		return -ENOMEM;
3012 
3013 	bpf_iter_tcp_put_batch(iter);
3014 	kvfree(iter->batch);
3015 	iter->batch = new_batch;
3016 	iter->max_sk = new_batch_sz;
3017 
3018 	return 0;
3019 }
3020 
3021 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3022 						 struct sock *start_sk)
3023 {
3024 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3025 	struct bpf_tcp_iter_state *iter = seq->private;
3026 	struct tcp_iter_state *st = &iter->state;
3027 	struct hlist_nulls_node *node;
3028 	unsigned int expected = 1;
3029 	struct sock *sk;
3030 
3031 	sock_hold(start_sk);
3032 	iter->batch[iter->end_sk++] = start_sk;
3033 
3034 	sk = sk_nulls_next(start_sk);
3035 	sk_nulls_for_each_from(sk, node) {
3036 		if (seq_sk_match(seq, sk)) {
3037 			if (iter->end_sk < iter->max_sk) {
3038 				sock_hold(sk);
3039 				iter->batch[iter->end_sk++] = sk;
3040 			}
3041 			expected++;
3042 		}
3043 	}
3044 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3045 
3046 	return expected;
3047 }
3048 
3049 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3050 						   struct sock *start_sk)
3051 {
3052 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3053 	struct bpf_tcp_iter_state *iter = seq->private;
3054 	struct tcp_iter_state *st = &iter->state;
3055 	struct hlist_nulls_node *node;
3056 	unsigned int expected = 1;
3057 	struct sock *sk;
3058 
3059 	sock_hold(start_sk);
3060 	iter->batch[iter->end_sk++] = start_sk;
3061 
3062 	sk = sk_nulls_next(start_sk);
3063 	sk_nulls_for_each_from(sk, node) {
3064 		if (seq_sk_match(seq, sk)) {
3065 			if (iter->end_sk < iter->max_sk) {
3066 				sock_hold(sk);
3067 				iter->batch[iter->end_sk++] = sk;
3068 			}
3069 			expected++;
3070 		}
3071 	}
3072 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3073 
3074 	return expected;
3075 }
3076 
3077 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3078 {
3079 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3080 	struct bpf_tcp_iter_state *iter = seq->private;
3081 	struct tcp_iter_state *st = &iter->state;
3082 	unsigned int expected;
3083 	bool resized = false;
3084 	struct sock *sk;
3085 
3086 	/* The st->bucket is done.  Directly advance to the next
3087 	 * bucket instead of having tcp_seek_last_pos() skip sockets
3088 	 * one by one in the current bucket, only to find out it has
3089 	 * to advance to the next bucket.
3090 	 */
3091 	if (iter->st_bucket_done) {
3092 		st->offset = 0;
3093 		st->bucket++;
3094 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3095 		    st->bucket > hinfo->lhash2_mask) {
3096 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3097 			st->bucket = 0;
3098 		}
3099 	}
3100 
3101 again:
3102 	/* Get a new batch */
3103 	iter->cur_sk = 0;
3104 	iter->end_sk = 0;
3105 	iter->st_bucket_done = false;
3106 
3107 	sk = tcp_seek_last_pos(seq);
3108 	if (!sk)
3109 		return NULL; /* Done */
3110 
3111 	if (st->state == TCP_SEQ_STATE_LISTENING)
3112 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3113 	else
3114 		expected = bpf_iter_tcp_established_batch(seq, sk);
3115 
3116 	if (iter->end_sk == expected) {
3117 		iter->st_bucket_done = true;
3118 		return sk;
3119 	}
3120 
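	/* The batch could not hold every matching socket in this bucket.
	 * Grow it once, with 50% headroom over the count we just saw, and
	 * retry; if it still does not fit, return the partial batch.
	 */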
3121 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3122 		resized = true;
3123 		goto again;
3124 	}
3125 
3126 	return sk;
3127 }
3128 
3129 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3130 {
3131 	/* bpf iter does not support lseek, so it always
3132 	 * continues from where it was stop()-ped.
3133 	 */
3134 	if (*pos)
3135 		return bpf_iter_tcp_batch(seq);
3136 
3137 	return SEQ_START_TOKEN;
3138 }
3139 
3140 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3141 {
3142 	struct bpf_tcp_iter_state *iter = seq->private;
3143 	struct tcp_iter_state *st = &iter->state;
3144 	struct sock *sk;
3145 
3146 	/* Whenever seq_next() is called, iter->cur_sk has been
3147 	 * handled by seq_show(), so advance to the next sk in
3148 	 * the batch.
3149 	 */
3150 	if (iter->cur_sk < iter->end_sk) {
3151 		/* Keep st->num consistent in tcp_iter_state.
3152 		 * bpf_iter_tcp does not use st->num.
3153 		 * meta.seq_num is used instead.
3154 		 */
3155 		st->num++;
3156 		/* Move st->offset to the next sk in the bucket such that
3157 		 * the future start() will resume at st->offset in
3158 		 * st->bucket.  See tcp_seek_last_pos().
3159 		 */
3160 		st->offset++;
3161 		sock_gen_put(iter->batch[iter->cur_sk++]);
3162 	}
3163 
3164 	if (iter->cur_sk < iter->end_sk)
3165 		sk = iter->batch[iter->cur_sk];
3166 	else
3167 		sk = bpf_iter_tcp_batch(seq);
3168 
3169 	++*pos;
3170 	/* Keep st->last_pos consistent in tcp_iter_state.
3171 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3172 	 */
3173 	st->last_pos = *pos;
3174 	return sk;
3175 }
3176 
3177 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3178 {
3179 	struct bpf_iter_meta meta;
3180 	struct bpf_prog *prog;
3181 	struct sock *sk = v;
3182 	uid_t uid;
3183 	int ret;
3184 
3185 	if (v == SEQ_START_TOKEN)
3186 		return 0;
3187 
3188 	if (sk_fullsock(sk))
3189 		lock_sock(sk);
3190 
3191 	if (unlikely(sk_unhashed(sk))) {
3192 		ret = SEQ_SKIP;
3193 		goto unlock;
3194 	}
3195 
3196 	if (sk->sk_state == TCP_TIME_WAIT) {
3197 		uid = 0;
3198 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3199 		const struct request_sock *req = v;
3200 
3201 		uid = from_kuid_munged(seq_user_ns(seq),
3202 				       sock_i_uid(req->rsk_listener));
3203 	} else {
3204 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3205 	}
3206 
3207 	meta.seq = seq;
3208 	prog = bpf_iter_get_info(&meta, false);
3209 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3210 
3211 unlock:
3212 	if (sk_fullsock(sk))
3213 		release_sock(sk);
3214 	return ret;
3215 
3216 }
3217 
3218 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3219 {
3220 	struct bpf_tcp_iter_state *iter = seq->private;
3221 	struct bpf_iter_meta meta;
3222 	struct bpf_prog *prog;
3223 
3224 	if (!v) {
3225 		meta.seq = seq;
3226 		prog = bpf_iter_get_info(&meta, true);
3227 		if (prog)
3228 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3229 	}
3230 
3231 	if (iter->cur_sk < iter->end_sk) {
3232 		bpf_iter_tcp_put_batch(iter);
3233 		iter->st_bucket_done = false;
3234 	}
3235 }
3236 
3237 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3238 	.show		= bpf_iter_tcp_seq_show,
3239 	.start		= bpf_iter_tcp_seq_start,
3240 	.next		= bpf_iter_tcp_seq_next,
3241 	.stop		= bpf_iter_tcp_seq_stop,
3242 };
3243 #endif
3244 static unsigned short seq_file_family(const struct seq_file *seq)
3245 {
3246 	const struct tcp_seq_afinfo *afinfo;
3247 
3248 #ifdef CONFIG_BPF_SYSCALL
3249 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3250 	if (seq->op == &bpf_iter_tcp_seq_ops)
3251 		return AF_UNSPEC;
3252 #endif
3253 
3254 	/* Iterated from proc fs */
3255 	afinfo = pde_data(file_inode(seq->file));
3256 	return afinfo->family;
3257 }
3258 
3259 static const struct seq_operations tcp4_seq_ops = {
3260 	.show		= tcp4_seq_show,
3261 	.start		= tcp_seq_start,
3262 	.next		= tcp_seq_next,
3263 	.stop		= tcp_seq_stop,
3264 };
3265 
3266 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3267 	.family		= AF_INET,
3268 };
3269 
3270 static int __net_init tcp4_proc_init_net(struct net *net)
3271 {
3272 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3273 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3274 		return -ENOMEM;
3275 	return 0;
3276 }
3277 
3278 static void __net_exit tcp4_proc_exit_net(struct net *net)
3279 {
3280 	remove_proc_entry("tcp", net->proc_net);
3281 }
3282 
3283 static struct pernet_operations tcp4_net_ops = {
3284 	.init = tcp4_proc_init_net,
3285 	.exit = tcp4_proc_exit_net,
3286 };
3287 
3288 int __init tcp4_proc_init(void)
3289 {
3290 	return register_pernet_subsys(&tcp4_net_ops);
3291 }
3292 
3293 void tcp4_proc_exit(void)
3294 {
3295 	unregister_pernet_subsys(&tcp4_net_ops);
3296 }
3297 #endif /* CONFIG_PROC_FS */
3298 
3299 /* @wake is one when sk_stream_write_space() calls us.
3300  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3301  * This mimics the strategy used in sock_def_write_space().
3302  */
3303 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3304 {
3305 	const struct tcp_sock *tp = tcp_sk(sk);
3306 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3307 			    READ_ONCE(tp->snd_nxt);
3308 
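	/* When @wake is set, the shift doubles notsent_bytes, effectively
	 * halving the notsent_lowat threshold before we report writability.
	 */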
3309 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3310 }
3311 EXPORT_SYMBOL(tcp_stream_memory_free);
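
/* Illustration only (hypothetical userspace sketch, not part of this file):
 * with TCP_NOTSENT_LOWAT set to 128 KB, a poller woken through
 * sk_stream_write_space() sees EPOLLOUT once fewer than 64 KB remain unsent:
 *
 *	int lowat = 128 * 1024;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 */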
3312 
3313 struct proto tcp_prot = {
3314 	.name			= "TCP",
3315 	.owner			= THIS_MODULE,
3316 	.close			= tcp_close,
3317 	.pre_connect		= tcp_v4_pre_connect,
3318 	.connect		= tcp_v4_connect,
3319 	.disconnect		= tcp_disconnect,
3320 	.accept			= inet_csk_accept,
3321 	.ioctl			= tcp_ioctl,
3322 	.init			= tcp_v4_init_sock,
3323 	.destroy		= tcp_v4_destroy_sock,
3324 	.shutdown		= tcp_shutdown,
3325 	.setsockopt		= tcp_setsockopt,
3326 	.getsockopt		= tcp_getsockopt,
3327 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3328 	.keepalive		= tcp_set_keepalive,
3329 	.recvmsg		= tcp_recvmsg,
3330 	.sendmsg		= tcp_sendmsg,
3331 	.splice_eof		= tcp_splice_eof,
3332 	.backlog_rcv		= tcp_v4_do_rcv,
3333 	.release_cb		= tcp_release_cb,
3334 	.hash			= inet_hash,
3335 	.unhash			= inet_unhash,
3336 	.get_port		= inet_csk_get_port,
3337 	.put_port		= inet_put_port,
3338 #ifdef CONFIG_BPF_SYSCALL
3339 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3340 #endif
3341 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3342 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3343 	.stream_memory_free	= tcp_stream_memory_free,
3344 	.sockets_allocated	= &tcp_sockets_allocated,
3345 	.orphan_count		= &tcp_orphan_count,
3346 
3347 	.memory_allocated	= &tcp_memory_allocated,
3348 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3349 
3350 	.memory_pressure	= &tcp_memory_pressure,
3351 	.sysctl_mem		= sysctl_tcp_mem,
3352 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3353 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3354 	.max_header		= MAX_TCP_HEADER,
3355 	.obj_size		= sizeof(struct tcp_sock),
3356 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3357 	.twsk_prot		= &tcp_timewait_sock_ops,
3358 	.rsk_prot		= &tcp_request_sock_ops,
3359 	.h.hashinfo		= NULL,
3360 	.no_autobind		= true,
3361 	.diag_destroy		= tcp_abort,
3362 };
3363 EXPORT_SYMBOL(tcp_prot);
3364 
3365 static void __net_exit tcp_sk_exit(struct net *net)
3366 {
3367 	if (net->ipv4.tcp_congestion_control)
3368 		bpf_module_put(net->ipv4.tcp_congestion_control,
3369 			       net->ipv4.tcp_congestion_control->owner);
3370 }
3371 
3372 static void __net_init tcp_set_hashinfo(struct net *net)
3373 {
3374 	struct inet_hashinfo *hinfo;
3375 	unsigned int ehash_entries;
3376 	struct net *old_net;
3377 
3378 	if (net_eq(net, &init_net))
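	/* A child netns may get its own ehash, sized from the creating netns'
	 * sysctl_tcp_child_ehash_entries rounded up to a power of two;
	 * otherwise it shares the global tcp_hashinfo.  max_tw_buckets and
	 * max_syn_backlog are then scaled from the chosen ehash size.
	 */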
3379 		goto fallback;
3380 
3381 	old_net = current->nsproxy->net_ns;
3382 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3383 	if (!ehash_entries)
3384 		goto fallback;
3385 
3386 	ehash_entries = roundup_pow_of_two(ehash_entries);
3387 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3388 	if (!hinfo) {
3389 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3390 			"for a netns, fallback to the global one\n",
3391 			ehash_entries);
3392 fallback:
3393 		hinfo = &tcp_hashinfo;
3394 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3395 	}
3396 
3397 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3398 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3399 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3400 }
3401 
3402 static int __net_init tcp_sk_init(struct net *net)
3403 {
3404 	net->ipv4.sysctl_tcp_ecn = 2;
3405 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3406 
3407 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3408 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3409 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3410 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3411 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3412 
3413 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3414 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3415 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3416 
3417 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3418 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3419 	net->ipv4.sysctl_tcp_syncookies = 1;
3420 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3421 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3422 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3423 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3424 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3425 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3426 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3427 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3428 
3429 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3430 	tcp_set_hashinfo(net);
3431 
3432 	net->ipv4.sysctl_tcp_sack = 1;
3433 	net->ipv4.sysctl_tcp_window_scaling = 1;
3434 	net->ipv4.sysctl_tcp_timestamps = 1;
3435 	net->ipv4.sysctl_tcp_early_retrans = 3;
3436 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3437 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3438 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3439 	net->ipv4.sysctl_tcp_max_reordering = 300;
3440 	net->ipv4.sysctl_tcp_dsack = 1;
3441 	net->ipv4.sysctl_tcp_app_win = 31;
3442 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3443 	net->ipv4.sysctl_tcp_frto = 2;
3444 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3445 	/* This limits the percentage of the congestion window which we
3446 	 * will allow a single TSO frame to consume.  Building TSO frames
3447 	 * which are too large can cause TCP streams to be bursty.
3448 	 */
3449 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3450 	/* Default TSQ limit of 16 TSO segments */
3451 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3452 
3453 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3454 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3455 
3456 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3457 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3458 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3459 	net->ipv4.sysctl_tcp_autocorking = 1;
3460 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3461 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3462 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3463 	if (net != &init_net) {
3464 		memcpy(net->ipv4.sysctl_tcp_rmem,
3465 		       init_net.ipv4.sysctl_tcp_rmem,
3466 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3467 		memcpy(net->ipv4.sysctl_tcp_wmem,
3468 		       init_net.ipv4.sysctl_tcp_wmem,
3469 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3470 	}
3471 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3472 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3473 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3474 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3475 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3476 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3477 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3478 
3479 	/* Set default values for PLB */
3480 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3481 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3482 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3483 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3484 	/* Default congestion threshold for PLB to mark a round is 50% */
3485 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3486 
3487 	/* Reno is always built in */
3488 	if (!net_eq(net, &init_net) &&
3489 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3490 			       init_net.ipv4.tcp_congestion_control->owner))
3491 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3492 	else
3493 		net->ipv4.tcp_congestion_control = &tcp_reno;
3494 
3495 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3496 	net->ipv4.sysctl_tcp_shrink_window = 0;
3497 
3498 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3499 
3500 	return 0;
3501 }
3502 
3503 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3504 {
3505 	struct net *net;
3506 
3507 	tcp_twsk_purge(net_exit_list, AF_INET);
3508 
3509 	list_for_each_entry(net, net_exit_list, exit_list) {
3510 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3511 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3512 		tcp_fastopen_ctx_destroy(net);
3513 	}
3514 }
3515 
3516 static struct pernet_operations __net_initdata tcp_sk_ops = {
3517        .init	   = tcp_sk_init,
3518        .exit	   = tcp_sk_exit,
3519        .exit_batch = tcp_sk_exit_batch,
3520 };
3521 
3522 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3523 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3524 		     struct sock_common *sk_common, uid_t uid)
3525 
3526 #define INIT_BATCH_SZ 16
3527 
3528 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3529 {
3530 	struct bpf_tcp_iter_state *iter = priv_data;
3531 	int err;
3532 
3533 	err = bpf_iter_init_seq_net(priv_data, aux);
3534 	if (err)
3535 		return err;
3536 
3537 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3538 	if (err) {
3539 		bpf_iter_fini_seq_net(priv_data);
3540 		return err;
3541 	}
3542 
3543 	return 0;
3544 }
3545 
3546 static void bpf_iter_fini_tcp(void *priv_data)
3547 {
3548 	struct bpf_tcp_iter_state *iter = priv_data;
3549 
3550 	bpf_iter_fini_seq_net(priv_data);
3551 	kvfree(iter->batch);
3552 }
3553 
3554 static const struct bpf_iter_seq_info tcp_seq_info = {
3555 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3556 	.init_seq_private	= bpf_iter_init_tcp,
3557 	.fini_seq_private	= bpf_iter_fini_tcp,
3558 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3559 };
3560 
3561 static const struct bpf_func_proto *
3562 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3563 			    const struct bpf_prog *prog)
3564 {
3565 	switch (func_id) {
3566 	case BPF_FUNC_setsockopt:
3567 		return &bpf_sk_setsockopt_proto;
3568 	case BPF_FUNC_getsockopt:
3569 		return &bpf_sk_getsockopt_proto;
3570 	default:
3571 		return NULL;
3572 	}
3573 }
3574 
3575 static struct bpf_iter_reg tcp_reg_info = {
3576 	.target			= "tcp",
3577 	.ctx_arg_info_size	= 1,
3578 	.ctx_arg_info		= {
3579 		{ offsetof(struct bpf_iter__tcp, sk_common),
3580 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3581 	},
3582 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3583 	.seq_info		= &tcp_seq_info,
3584 };
3585 
3586 static void __init bpf_iter_register(void)
3587 {
3588 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3589 	if (bpf_iter_reg_target(&tcp_reg_info))
3590 		pr_warn("Warning: could not register bpf iterator tcp\n");
3591 }
3592 
3593 #endif
3594 
3595 void __init tcp_v4_init(void)
3596 {
3597 	int cpu, res;
3598 
3599 	for_each_possible_cpu(cpu) {
3600 		struct sock *sk;
3601 
3602 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3603 					   IPPROTO_TCP, &init_net);
3604 		if (res)
3605 			panic("Failed to create the TCP control socket.\n");
3606 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3607 
3608 		/* Please enforce IP_DF and IPID==0 for RST and
3609 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3610 		 */
3611 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3612 
3613 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3614 	}
3615 	if (register_pernet_subsys(&tcp_sk_ops))
3616 		panic("Failed to register the TCP pernet operations.\n");
3617 
3618 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3619 	bpf_iter_register();
3620 #endif
3621 }
3622