1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/inet_ecn.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/secure_seq.h>
73 #include <net/busy_poll.h>
74 #include <net/rstreason.h>
75 
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 #include <linux/inetdevice.h>
82 #include <linux/btf_ids.h>
83 #include <linux/skbuff_ref.h>
84 
85 #include <crypto/hash.h>
86 #include <linux/scatterlist.h>
87 
88 #include <trace/events/tcp.h>
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
93 #endif
94 
95 struct inet_hashinfo tcp_hashinfo;
96 
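/* Per-CPU control socket used by tcp_v4_send_reset() and tcp_v4_send_ack()
 * to send replies without a full socket context; bh_lock serializes access
 * from bottom-half context.
 */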
97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
99 };
100 
101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
102 
103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104 {
105 	return secure_tcp_seq(ip_hdr(skb)->daddr,
106 			      ip_hdr(skb)->saddr,
107 			      tcp_hdr(skb)->dest,
108 			      tcp_hdr(skb)->source);
109 }
110 
111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112 {
113 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114 }
115 
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
120 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 	struct tcp_sock *tp = tcp_sk(sk);
122 	int ts_recent_stamp;
123 	u32 reuse_thresh;
124 
125 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
126 		reuse = 0;
127 
128 	if (reuse == 2) {
129 		/* Still does not detect *everything* that goes through
130 		 * lo, since we require a loopback src or dst address
131 		 * or direct binding to 'lo' interface.
132 		 */
133 		bool loopback = false;
134 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
135 			loopback = true;
136 #if IS_ENABLED(CONFIG_IPV6)
137 		if (tw->tw_family == AF_INET6) {
138 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
139 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
140 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
142 				loopback = true;
143 		} else
144 #endif
145 		{
146 			if (ipv4_is_loopback(tw->tw_daddr) ||
147 			    ipv4_is_loopback(tw->tw_rcv_saddr))
148 				loopback = true;
149 		}
150 		if (!loopback)
151 			reuse = 0;
152 	}
153 
154 	/* With PAWS, it is safe from the viewpoint
155 	   of data integrity. Even without PAWS it is safe provided sequence
156 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
157 
158 	   Actually, the idea is close to VJ's, only the timestamp cache is
159 	   held not per host but per port pair, and the TW bucket is used as
160 	   the state holder.
161 
162 	   If the TW bucket has already been destroyed we fall back to VJ's
163 	   scheme and use the initial timestamp retrieved from the peer table.
164 	 */
165 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
166 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
167 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
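	/* Reuse is allowed when the TIME-WAIT socket carries a valid timestamp
	 * and either the caller did not ask for the tw bucket (!twp) or
	 * tcp_tw_reuse is enabled and tcp_tw_reuse_delay has elapsed since the
	 * socket entered TIME-WAIT.
	 */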
168 	if (ts_recent_stamp &&
169 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
170 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
171 		 * and releasing the bucket lock.
172 		 */
173 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
174 			return 0;
175 
176 		/* In case of repair and re-using TIME-WAIT sockets we still
177 		 * want to be sure that it is safe as above but honor the
178 		 * sequence numbers and time stamps set as part of the repair
179 		 * process.
180 		 *
181 		 * Without this check re-using a TIME-WAIT socket with TCP
182 		 * repair would accumulate a -1 on the repair assigned
183 		 * sequence number. The first time it is reused the sequence
184 		 * is -1, the second time -2, etc. This fixes that issue
185 		 * without appearing to create any others.
186 		 */
187 		if (likely(!tp->repair)) {
188 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
189 
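			/* write_seq == 0 means "pick a fresh ISN" in
			 * tcp_v4_connect(), so never hand out zero here.
			 */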
190 			if (!seq)
191 				seq = 1;
192 			WRITE_ONCE(tp->write_seq, seq);
193 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
194 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
195 		}
196 
197 		return 1;
198 	}
199 
200 	return 0;
201 }
202 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
203 
204 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
205 			      int addr_len)
206 {
207 	/* This check is replicated from tcp_v4_connect() and intended to
208 	 * prevent BPF program called below from accessing bytes that are out
209 	 * of the bound specified by user in addr_len.
210 	 */
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	sock_owned_by_me(sk);
215 
216 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
217 }
218 
219 /* This will initiate an outgoing connection. */
220 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
221 {
222 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
223 	struct inet_timewait_death_row *tcp_death_row;
224 	struct inet_sock *inet = inet_sk(sk);
225 	struct tcp_sock *tp = tcp_sk(sk);
226 	struct ip_options_rcu *inet_opt;
227 	struct net *net = sock_net(sk);
228 	__be16 orig_sport, orig_dport;
229 	__be32 daddr, nexthop;
230 	struct flowi4 *fl4;
231 	struct rtable *rt;
232 	int err;
233 
234 	if (addr_len < sizeof(struct sockaddr_in))
235 		return -EINVAL;
236 
237 	if (usin->sin_family != AF_INET)
238 		return -EAFNOSUPPORT;
239 
240 	nexthop = daddr = usin->sin_addr.s_addr;
241 	inet_opt = rcu_dereference_protected(inet->inet_opt,
242 					     lockdep_sock_is_held(sk));
243 	if (inet_opt && inet_opt->opt.srr) {
244 		if (!daddr)
245 			return -EINVAL;
246 		nexthop = inet_opt->opt.faddr;
247 	}
248 
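	/* With an IP source route, resolve the route towards the first hop
	 * (nexthop/faddr) while daddr remains the final destination.
	 */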
249 	orig_sport = inet->inet_sport;
250 	orig_dport = usin->sin_port;
251 	fl4 = &inet->cork.fl.u.ip4;
252 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
253 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
254 			      orig_dport, sk);
255 	if (IS_ERR(rt)) {
256 		err = PTR_ERR(rt);
257 		if (err == -ENETUNREACH)
258 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
259 		return err;
260 	}
261 
262 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
263 		ip_rt_put(rt);
264 		return -ENETUNREACH;
265 	}
266 
267 	if (!inet_opt || !inet_opt->opt.srr)
268 		daddr = fl4->daddr;
269 
270 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
271 
272 	if (!inet->inet_saddr) {
273 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
274 		if (err) {
275 			ip_rt_put(rt);
276 			return err;
277 		}
278 	} else {
279 		sk_rcv_saddr_set(sk, inet->inet_saddr);
280 	}
281 
282 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
283 		/* Reset inherited state */
284 		tp->rx_opt.ts_recent	   = 0;
285 		tp->rx_opt.ts_recent_stamp = 0;
286 		if (likely(!tp->repair))
287 			WRITE_ONCE(tp->write_seq, 0);
288 	}
289 
290 	inet->inet_dport = usin->sin_port;
291 	sk_daddr_set(sk, daddr);
292 
293 	inet_csk(sk)->icsk_ext_hdr_len = 0;
294 	if (inet_opt)
295 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
296 
297 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
298 
299 	/* Socket identity is still unknown (sport may be zero).
300 	 * However we set state to SYN-SENT and, without releasing the socket
301 	 * lock, select a source port, enter ourselves into the hash tables and
302 	 * complete initialization after this.
303 	 */
304 	tcp_set_state(sk, TCP_SYN_SENT);
305 	err = inet_hash_connect(tcp_death_row, sk);
306 	if (err)
307 		goto failure;
308 
309 	sk_set_txhash(sk);
310 
311 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
312 			       inet->inet_sport, inet->inet_dport, sk);
313 	if (IS_ERR(rt)) {
314 		err = PTR_ERR(rt);
315 		rt = NULL;
316 		goto failure;
317 	}
318 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
319 	/* OK, now commit destination to socket.  */
320 	sk->sk_gso_type = SKB_GSO_TCPV4;
321 	sk_setup_caps(sk, &rt->dst);
322 	rt = NULL;
323 
324 	if (likely(!tp->repair)) {
325 		if (!tp->write_seq)
326 			WRITE_ONCE(tp->write_seq,
327 				   secure_tcp_seq(inet->inet_saddr,
328 						  inet->inet_daddr,
329 						  inet->inet_sport,
330 						  usin->sin_port));
331 		WRITE_ONCE(tp->tsoffset,
332 			   secure_tcp_ts_off(net, inet->inet_saddr,
333 					     inet->inet_daddr));
334 	}
335 
336 	atomic_set(&inet->inet_id, get_random_u16());
337 
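	/* With TCP_FASTOPEN_CONNECT the SYN is deferred until the first
	 * sendmsg(); in that case connect() returns without sending anything.
	 */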
338 	if (tcp_fastopen_defer_connect(sk, &err))
339 		return err;
340 	if (err)
341 		goto failure;
342 
343 	err = tcp_connect(sk);
344 
345 	if (err)
346 		goto failure;
347 
348 	return 0;
349 
350 failure:
351 	/*
352 	 * This unhashes the socket and releases the local port,
353 	 * if necessary.
354 	 */
355 	tcp_set_state(sk, TCP_CLOSE);
356 	inet_bhash2_reset_saddr(sk);
357 	ip_rt_put(rt);
358 	sk->sk_route_caps = 0;
359 	inet->inet_dport = 0;
360 	return err;
361 }
362 EXPORT_IPV6_MOD(tcp_v4_connect);
363 
364 /*
365  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
366  * It can be called through tcp_release_cb() if socket was owned by user
367  * at the time tcp_v4_err() was called to handle ICMP message.
368  */
369 void tcp_v4_mtu_reduced(struct sock *sk)
370 {
371 	struct inet_sock *inet = inet_sk(sk);
372 	struct dst_entry *dst;
373 	u32 mtu;
374 
375 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
376 		return;
377 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
378 	dst = inet_csk_update_pmtu(sk, mtu);
379 	if (!dst)
380 		return;
381 
382 	/* Something is about to go wrong... Remember the soft error
383 	 * in case this connection is not able to recover.
384 	 */
385 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
386 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
387 
388 	mtu = dst_mtu(dst);
389 
390 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
391 	    ip_sk_accept_pmtu(sk) &&
392 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
393 		tcp_sync_mss(sk, mtu);
394 
395 		/* Resend the TCP packet because it's
396 		 * clear that the old packet has been
397 		 * dropped. This is the new "fast" path mtu
398 		 * discovery.
399 		 */
400 		tcp_simple_retransmit(sk);
401 	} /* else let the usual retransmit timer handle it */
402 }
403 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
404 
405 static void do_redirect(struct sk_buff *skb, struct sock *sk)
406 {
407 	struct dst_entry *dst = __sk_dst_check(sk, 0);
408 
409 	if (dst)
410 		dst->ops->redirect(dst, sk, skb);
411 }
412 
413 
414 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
415 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
416 {
417 	struct request_sock *req = inet_reqsk(sk);
418 	struct net *net = sock_net(sk);
419 
420 	/* ICMPs are not backlogged, hence we cannot get
421 	 * an established socket here.
422 	 */
423 	if (seq != tcp_rsk(req)->snt_isn) {
424 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
425 	} else if (abort) {
426 		/*
427 		 * Still in SYN_RECV, just remove it silently.
428 		 * There is no good way to pass the error to the newly
429 		 * created socket, and POSIX does not want network
430 		 * errors returned from accept().
431 		 */
432 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
433 		tcp_listendrop(req->rsk_listener);
434 	}
435 	reqsk_put(req);
436 }
437 EXPORT_IPV6_MOD(tcp_req_err);
438 
439 /* TCP-LD (RFC 6069) logic */
440 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
441 {
442 	struct inet_connection_sock *icsk = inet_csk(sk);
443 	struct tcp_sock *tp = tcp_sk(sk);
444 	struct sk_buff *skb;
445 	s32 remaining;
446 	u32 delta_us;
447 
448 	if (sock_owned_by_user(sk))
449 		return;
450 
451 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
452 	    !icsk->icsk_backoff)
453 		return;
454 
455 	skb = tcp_rtx_queue_head(sk);
456 	if (WARN_ON_ONCE(!skb))
457 		return;
458 
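	/* Undo one exponential back-off step and recompute the RTO from the
	 * current srtt (or the default if no RTT samples exist yet).
	 */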
459 	icsk->icsk_backoff--;
460 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
461 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
462 
463 	tcp_mstamp_refresh(tp);
464 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
465 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
466 
467 	if (remaining > 0) {
468 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
469 	} else {
470 		/* RTO revert clocked out retransmission.
471 		 * Will retransmit now.
472 		 */
473 		tcp_retransmit_timer(sk);
474 	}
475 }
476 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
477 
478 /*
479  * This routine is called by the ICMP module when it gets some
480  * sort of error condition.  If err < 0 then the socket should
481  * be closed and the error returned to the user.  If err > 0
482  * it's just the icmp type << 8 | icmp code.  After adjustment
483  * header points to the first 8 bytes of the tcp header.  We need
484  * to find the appropriate port.
485  *
486  * The locking strategy used here is very "optimistic". When
487  * someone else accesses the socket the ICMP is just dropped
488  * and for some paths there is no check at all.
489  * A more general error queue to queue errors for later handling
490  * is probably better.
491  *
492  */
493 
494 int tcp_v4_err(struct sk_buff *skb, u32 info)
495 {
496 	const struct iphdr *iph = (const struct iphdr *)skb->data;
497 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
498 	struct net *net = dev_net_rcu(skb->dev);
499 	const int type = icmp_hdr(skb)->type;
500 	const int code = icmp_hdr(skb)->code;
501 	struct request_sock *fastopen;
502 	struct tcp_sock *tp;
503 	u32 seq, snd_una;
504 	struct sock *sk;
505 	int err;
506 
507 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
508 				       iph->daddr, th->dest, iph->saddr,
509 				       ntohs(th->source), inet_iif(skb), 0);
510 	if (!sk) {
511 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
512 		return -ENOENT;
513 	}
514 	if (sk->sk_state == TCP_TIME_WAIT) {
515 		/* To increase the counter of ignored icmps for TCP-AO */
516 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
517 		inet_twsk_put(inet_twsk(sk));
518 		return 0;
519 	}
520 	seq = ntohl(th->seq);
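	/* For request sockets, only hard errors (parameter problem, TTL
	 * exceeded, net/host unreachable) abort the embryonic connection;
	 * other ICMP errors are ignored.
	 */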
521 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
522 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
523 				     type == ICMP_TIME_EXCEEDED ||
524 				     (type == ICMP_DEST_UNREACH &&
525 				      (code == ICMP_NET_UNREACH ||
526 				       code == ICMP_HOST_UNREACH)));
527 		return 0;
528 	}
529 
530 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
531 		sock_put(sk);
532 		return 0;
533 	}
534 
535 	bh_lock_sock(sk);
536 	/* If too many ICMPs get dropped on busy
537 	 * servers this needs to be solved differently.
538 	 * We do take care of the PMTU discovery (RFC1191) special case:
539 	 * we can receive locally generated ICMP messages while socket is held.
540 	 */
541 	if (sock_owned_by_user(sk)) {
542 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
543 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
544 	}
545 	if (sk->sk_state == TCP_CLOSE)
546 		goto out;
547 
548 	if (static_branch_unlikely(&ip4_min_ttl)) {
549 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
550 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
551 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
552 			goto out;
553 		}
554 	}
555 
556 	tp = tcp_sk(sk);
557 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
558 	fastopen = rcu_dereference(tp->fastopen_rsk);
559 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
560 	if (sk->sk_state != TCP_LISTEN &&
561 	    !between(seq, snd_una, tp->snd_nxt)) {
562 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
563 		goto out;
564 	}
565 
566 	switch (type) {
567 	case ICMP_REDIRECT:
568 		if (!sock_owned_by_user(sk))
569 			do_redirect(skb, sk);
570 		goto out;
571 	case ICMP_SOURCE_QUENCH:
572 		/* Just silently ignore these. */
573 		goto out;
574 	case ICMP_PARAMETERPROB:
575 		err = EPROTO;
576 		break;
577 	case ICMP_DEST_UNREACH:
578 		if (code > NR_ICMP_UNREACH)
579 			goto out;
580 
581 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
582 			/* We are not interested in TCP_LISTEN and open_requests
583 			 * (SYN-ACKs sent out by Linux are always <576 bytes, so
584 			 * they should go through unfragmented).
585 			 */
586 			if (sk->sk_state == TCP_LISTEN)
587 				goto out;
588 
589 			WRITE_ONCE(tp->mtu_info, info);
590 			if (!sock_owned_by_user(sk)) {
591 				tcp_v4_mtu_reduced(sk);
592 			} else {
593 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
594 					sock_hold(sk);
595 			}
596 			goto out;
597 		}
598 
599 		err = icmp_err_convert[code].errno;
600 		/* check if this ICMP message allows revert of backoff.
601 		 * (see RFC 6069)
602 		 */
603 		if (!fastopen &&
604 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
605 			tcp_ld_RTO_revert(sk, seq);
606 		break;
607 	case ICMP_TIME_EXCEEDED:
608 		err = EHOSTUNREACH;
609 		break;
610 	default:
611 		goto out;
612 	}
613 
614 	switch (sk->sk_state) {
615 	case TCP_SYN_SENT:
616 	case TCP_SYN_RECV:
617 		/* Only in fast or simultaneous open. If a fast open socket is
618 		 * already accepted it is treated as a connected one below.
619 		 */
620 		if (fastopen && !fastopen->sk)
621 			break;
622 
623 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
624 
625 		if (!sock_owned_by_user(sk))
626 			tcp_done_with_error(sk, err);
627 		else
628 			WRITE_ONCE(sk->sk_err_soft, err);
629 		goto out;
630 	}
631 
632 	/* If we've already connected we will keep trying
633 	 * until we time out, or the user gives up.
634 	 *
635 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
636 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
637 	 * but it is obsoleted by pmtu discovery).
638 	 *
639 	 * Note that in the modern internet, where routing is unreliable
640 	 * and broken firewalls sit in every dark corner sending random
641 	 * errors ordered by their masters, even these two messages finally
642 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
643 	 *
644 	 * Now we are in compliance with RFCs.
645 	 *							--ANK (980905)
646 	 */
647 
648 	if (!sock_owned_by_user(sk) &&
649 	    inet_test_bit(RECVERR, sk)) {
650 		WRITE_ONCE(sk->sk_err, err);
651 		sk_error_report(sk);
652 	} else	{ /* Only an error on timeout */
653 		WRITE_ONCE(sk->sk_err_soft, err);
654 	}
655 
656 out:
657 	bh_unlock_sock(sk);
658 	sock_put(sk);
659 	return 0;
660 }
661 
662 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
663 {
664 	struct tcphdr *th = tcp_hdr(skb);
665 
666 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
667 	skb->csum_start = skb_transport_header(skb) - skb->head;
668 	skb->csum_offset = offsetof(struct tcphdr, check);
669 }
670 
671 /* This routine computes an IPv4 TCP checksum. */
672 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
673 {
674 	const struct inet_sock *inet = inet_sk(sk);
675 
676 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
677 }
678 EXPORT_IPV6_MOD(tcp_v4_send_check);
679 
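/* Maximum TCP option space (40 bytes) expressed in 32-bit words */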
680 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
681 
682 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
683 				 const struct tcp_ao_hdr *aoh,
684 				 struct ip_reply_arg *arg, struct tcphdr *reply,
685 				 __be32 reply_options[REPLY_OPTIONS_LEN])
686 {
687 #ifdef CONFIG_TCP_AO
688 	int sdif = tcp_v4_sdif(skb);
689 	int dif = inet_iif(skb);
690 	int l3index = sdif ? dif : 0;
691 	bool allocated_traffic_key;
692 	struct tcp_ao_key *key;
693 	char *traffic_key;
694 	bool drop = true;
695 	u32 ao_sne = 0;
696 	u8 keyid;
697 
698 	rcu_read_lock();
699 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
700 				 &key, &traffic_key, &allocated_traffic_key,
701 				 &keyid, &ao_sne))
702 		goto out;
703 
704 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
705 				 (aoh->rnext_keyid << 8) | keyid);
706 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
707 	reply->doff = arg->iov[0].iov_len / 4;
708 
709 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
710 			    key, traffic_key,
711 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
712 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
713 			    reply, ao_sne))
714 		goto out;
715 	drop = false;
716 out:
717 	rcu_read_unlock();
718 	if (allocated_traffic_key)
719 		kfree(traffic_key);
720 	return drop;
721 #else
722 	return true;
723 #endif
724 }
725 
726 /*
727  *	This routine will send an RST to the other tcp.
728  *
729  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
730  *		      for the reset?
731  *	Answer: if a packet caused the RST, it is not for a socket
732  *		existing in our system; if it is matched to a socket,
733  *		it is just a duplicate segment or a bug in the other side's TCP.
734  *		So we build the reply based only on the parameters
735  *		that arrived with the segment.
736  *	Exception: precedence violation. We do not implement it in any case.
737  */
738 
739 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
740 			      enum sk_rst_reason reason)
741 {
742 	const struct tcphdr *th = tcp_hdr(skb);
743 	struct {
744 		struct tcphdr th;
745 		__be32 opt[REPLY_OPTIONS_LEN];
746 	} rep;
747 	const __u8 *md5_hash_location = NULL;
748 	const struct tcp_ao_hdr *aoh;
749 	struct ip_reply_arg arg;
750 #ifdef CONFIG_TCP_MD5SIG
751 	struct tcp_md5sig_key *key = NULL;
752 	unsigned char newhash[16];
753 	struct sock *sk1 = NULL;
754 	int genhash;
755 #endif
756 	u64 transmit_time = 0;
757 	struct sock *ctl_sk;
758 	struct net *net;
759 	u32 txhash = 0;
760 
761 	/* Never send a reset in response to a reset. */
762 	if (th->rst)
763 		return;
764 
765 	/* If sk not NULL, it means we did a successful lookup and incoming
766 	 * route had to be correct. prequeue might have dropped our dst.
767 	 */
768 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
769 		return;
770 
771 	/* Swap the send and the receive. */
772 	memset(&rep, 0, sizeof(rep));
773 	rep.th.dest   = th->source;
774 	rep.th.source = th->dest;
775 	rep.th.doff   = sizeof(struct tcphdr) / 4;
776 	rep.th.rst    = 1;
777 
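	/* Per RFC 793: if the incoming segment carries an ACK, the RST uses its
	 * ack_seq as sequence number; otherwise the RST acks everything the
	 * segment occupied (SYN and FIN each count as one sequence number).
	 */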
778 	if (th->ack) {
779 		rep.th.seq = th->ack_seq;
780 	} else {
781 		rep.th.ack = 1;
782 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
783 				       skb->len - (th->doff << 2));
784 	}
785 
786 	memset(&arg, 0, sizeof(arg));
787 	arg.iov[0].iov_base = (unsigned char *)&rep;
788 	arg.iov[0].iov_len  = sizeof(rep.th);
789 
790 	net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
791 
792 	/* Invalid TCP option size or twice included auth */
793 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
794 		return;
795 
796 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
797 		return;
798 
799 #ifdef CONFIG_TCP_MD5SIG
800 	rcu_read_lock();
801 	if (sk && sk_fullsock(sk)) {
802 		const union tcp_md5_addr *addr;
803 		int l3index;
804 
805 		/* If sdif is set, the packet ingressed via a device
806 		 * in an L3 domain and inet_iif is set to it.
807 		 */
808 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
809 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
810 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
811 	} else if (md5_hash_location) {
812 		const union tcp_md5_addr *addr;
813 		int sdif = tcp_v4_sdif(skb);
814 		int dif = inet_iif(skb);
815 		int l3index;
816 
817 		/*
818 		 * The active side is lost. Try to find the listening socket through
819 		 * the source port, and then find the md5 key through that socket.
820 		 * We do not loosen security here:
821 		 * the incoming packet is checked against the md5 hash of the found
822 		 * key, and no RST is generated if the hash doesn't match.
823 		 */
824 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
825 					     NULL, 0, ip_hdr(skb)->saddr,
826 					     th->source, ip_hdr(skb)->daddr,
827 					     ntohs(th->source), dif, sdif);
828 		/* don't send rst if it can't find key */
829 		if (!sk1)
830 			goto out;
831 
832 		/* If sdif is set, the packet ingressed via a device
833 		 * in an L3 domain and dif is set to it.
834 		 */
835 		l3index = sdif ? dif : 0;
836 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
837 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
838 		if (!key)
839 			goto out;
840 
841 
842 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
843 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
844 			goto out;
845 
846 	}
847 
848 	if (key) {
849 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
850 				   (TCPOPT_NOP << 16) |
851 				   (TCPOPT_MD5SIG << 8) |
852 				   TCPOLEN_MD5SIG);
853 		/* Update length and the length the header thinks exists */
854 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
855 		rep.th.doff = arg.iov[0].iov_len / 4;
856 
857 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
858 				     key, ip_hdr(skb)->saddr,
859 				     ip_hdr(skb)->daddr, &rep.th);
860 	}
861 #endif
862 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
863 	if (rep.opt[0] == 0) {
864 		__be32 mrst = mptcp_reset_option(skb);
865 
866 		if (mrst) {
867 			rep.opt[0] = mrst;
868 			arg.iov[0].iov_len += sizeof(mrst);
869 			rep.th.doff = arg.iov[0].iov_len / 4;
870 		}
871 	}
872 
873 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
874 				      ip_hdr(skb)->saddr, /* XXX */
875 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
876 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
877 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
878 
879 	/* When the socket is gone, all binding information is lost.
880 	 * Routing might fail in this case. No choice here: if we choose to force
881 	 * the input interface, we will misroute in case of an asymmetric route.
882 	 */
883 	if (sk)
884 		arg.bound_dev_if = sk->sk_bound_dev_if;
885 
886 	trace_tcp_send_reset(sk, skb, reason);
887 
888 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
889 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
890 
891 	/* ECN bits of TW reset are cleared */
892 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
893 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
894 	local_bh_disable();
895 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
896 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
897 
898 	sock_net_set(ctl_sk, net);
899 	if (sk) {
900 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
901 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
902 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
903 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
904 		transmit_time = tcp_transmit_time(sk);
905 		xfrm_sk_clone_policy(ctl_sk, sk);
906 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
907 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
908 	} else {
909 		ctl_sk->sk_mark = 0;
910 		ctl_sk->sk_priority = 0;
911 	}
912 	ip_send_unicast_reply(ctl_sk, sk,
913 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
914 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
915 			      &arg, arg.iov[0].iov_len,
916 			      transmit_time, txhash);
917 
918 	xfrm_sk_free_policy(ctl_sk);
919 	sock_net_set(ctl_sk, &init_net);
920 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
921 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
922 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
923 	local_bh_enable();
924 
925 #ifdef CONFIG_TCP_MD5SIG
926 out:
927 	rcu_read_unlock();
928 #endif
929 }
930 
931 /* The code below, sending ACKs in SYN-RECV and TIME-WAIT states
932    outside socket context, is certainly ugly. What can I do?
933  */
934 
935 static void tcp_v4_send_ack(const struct sock *sk,
936 			    struct sk_buff *skb, u32 seq, u32 ack,
937 			    u32 win, u32 tsval, u32 tsecr, int oif,
938 			    struct tcp_key *key,
939 			    int reply_flags, u8 tos, u32 txhash)
940 {
941 	const struct tcphdr *th = tcp_hdr(skb);
942 	struct {
943 		struct tcphdr th;
944 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
945 	} rep;
946 	struct net *net = sock_net(sk);
947 	struct ip_reply_arg arg;
948 	struct sock *ctl_sk;
949 	u64 transmit_time;
950 
951 	memset(&rep.th, 0, sizeof(struct tcphdr));
952 	memset(&arg, 0, sizeof(arg));
953 
954 	arg.iov[0].iov_base = (unsigned char *)&rep;
955 	arg.iov[0].iov_len  = sizeof(rep.th);
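	/* If a timestamp is echoed it occupies rep.opt[0..2]; any MD5/AO
	 * option added below then starts at offset 3.
	 */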
956 	if (tsecr) {
957 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
958 				   (TCPOPT_TIMESTAMP << 8) |
959 				   TCPOLEN_TIMESTAMP);
960 		rep.opt[1] = htonl(tsval);
961 		rep.opt[2] = htonl(tsecr);
962 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
963 	}
964 
965 	/* Swap the send and the receive. */
966 	rep.th.dest    = th->source;
967 	rep.th.source  = th->dest;
968 	rep.th.doff    = arg.iov[0].iov_len / 4;
969 	rep.th.seq     = htonl(seq);
970 	rep.th.ack_seq = htonl(ack);
971 	rep.th.ack     = 1;
972 	rep.th.window  = htons(win);
973 
974 #ifdef CONFIG_TCP_MD5SIG
975 	if (tcp_key_is_md5(key)) {
976 		int offset = (tsecr) ? 3 : 0;
977 
978 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
979 					  (TCPOPT_NOP << 16) |
980 					  (TCPOPT_MD5SIG << 8) |
981 					  TCPOLEN_MD5SIG);
982 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
983 		rep.th.doff = arg.iov[0].iov_len/4;
984 
985 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
986 				    key->md5_key, ip_hdr(skb)->saddr,
987 				    ip_hdr(skb)->daddr, &rep.th);
988 	}
989 #endif
990 #ifdef CONFIG_TCP_AO
991 	if (tcp_key_is_ao(key)) {
992 		int offset = (tsecr) ? 3 : 0;
993 
994 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
995 					  (tcp_ao_len(key->ao_key) << 16) |
996 					  (key->ao_key->sndid << 8) |
997 					  key->rcv_next);
998 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
999 		rep.th.doff = arg.iov[0].iov_len / 4;
1000 
1001 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1002 				key->ao_key, key->traffic_key,
1003 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1004 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1005 				&rep.th, key->sne);
1006 	}
1007 #endif
1008 	arg.flags = reply_flags;
1009 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1010 				      ip_hdr(skb)->saddr, /* XXX */
1011 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1012 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1013 	if (oif)
1014 		arg.bound_dev_if = oif;
1015 	arg.tos = tos;
1016 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1017 	local_bh_disable();
1018 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1019 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1020 	sock_net_set(ctl_sk, net);
1021 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1022 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1023 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1024 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1025 	transmit_time = tcp_transmit_time(sk);
1026 	ip_send_unicast_reply(ctl_sk, sk,
1027 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1028 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1029 			      &arg, arg.iov[0].iov_len,
1030 			      transmit_time, txhash);
1031 
1032 	sock_net_set(ctl_sk, &init_net);
1033 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1034 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1035 	local_bh_enable();
1036 }
1037 
1038 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1039 				enum tcp_tw_status tw_status)
1040 {
1041 	struct inet_timewait_sock *tw = inet_twsk(sk);
1042 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1043 	struct tcp_key key = {};
1044 	u8 tos = tw->tw_tos;
1045 
1046 	/* Clean only the ECN bits of TW ACKs for out-of-window data or paws_reject,
1047 	 * while not cleaning ECN bits of other TW ACKs, to avoid those ACKs
1048 	 * being placed in a different service queue (Classic rather than L4S)
1049 	 */
1050 	if (tw_status == TCP_TW_ACK_OOW)
1051 		tos &= ~INET_ECN_MASK;
1052 
1053 #ifdef CONFIG_TCP_AO
1054 	struct tcp_ao_info *ao_info;
1055 
1056 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1057 		/* FIXME: the segment to-be-acked is not verified yet */
1058 		ao_info = rcu_dereference(tcptw->ao_info);
1059 		if (ao_info) {
1060 			const struct tcp_ao_hdr *aoh;
1061 
1062 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1063 				inet_twsk_put(tw);
1064 				return;
1065 			}
1066 
1067 			if (aoh)
1068 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1069 								    aoh->rnext_keyid, -1);
1070 		}
1071 	}
1072 	if (key.ao_key) {
1073 		struct tcp_ao_key *rnext_key;
1074 
1075 		key.traffic_key = snd_other_key(key.ao_key);
1076 		key.sne = READ_ONCE(ao_info->snd_sne);
1077 		rnext_key = READ_ONCE(ao_info->rnext_key);
1078 		key.rcv_next = rnext_key->rcvid;
1079 		key.type = TCP_KEY_AO;
1080 #else
1081 	if (0) {
1082 #endif
1083 	} else if (static_branch_tcp_md5()) {
1084 		key.md5_key = tcp_twsk_md5_key(tcptw);
1085 		if (key.md5_key)
1086 			key.type = TCP_KEY_MD5;
1087 	}
1088 
1089 	tcp_v4_send_ack(sk, skb,
1090 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1091 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1092 			tcp_tw_tsval(tcptw),
1093 			READ_ONCE(tcptw->tw_ts_recent),
1094 			tw->tw_bound_dev_if, &key,
1095 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1096 			tos,
1097 			tw->tw_txhash);
1098 
1099 	inet_twsk_put(tw);
1100 }
1101 
1102 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1103 				  struct request_sock *req)
1104 {
1105 	struct tcp_key key = {};
1106 
1107 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1108 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1109 	 */
1110 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1111 					     tcp_sk(sk)->snd_nxt;
1112 
1113 #ifdef CONFIG_TCP_AO
1114 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1115 	    tcp_rsk_used_ao(req)) {
1116 		const union tcp_md5_addr *addr;
1117 		const struct tcp_ao_hdr *aoh;
1118 		int l3index;
1119 
1120 		/* Invalid TCP option size or twice included auth */
1121 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1122 			return;
1123 		if (!aoh)
1124 			return;
1125 
1126 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1127 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1128 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1129 					      aoh->rnext_keyid, -1);
1130 		if (unlikely(!key.ao_key)) {
1131 			/* Send ACK with any matching MKT for the peer */
1132 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1133 			/* Matching key disappeared (user removed the key?),
1134 			 * so let the handshake time out.
1135 			 */
1136 			if (!key.ao_key) {
1137 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1138 						     addr,
1139 						     ntohs(tcp_hdr(skb)->source),
1140 						     &ip_hdr(skb)->daddr,
1141 						     ntohs(tcp_hdr(skb)->dest));
1142 				return;
1143 			}
1144 		}
1145 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1146 		if (!key.traffic_key)
1147 			return;
1148 
1149 		key.type = TCP_KEY_AO;
1150 		key.rcv_next = aoh->keyid;
1151 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1152 #else
1153 	if (0) {
1154 #endif
1155 	} else if (static_branch_tcp_md5()) {
1156 		const union tcp_md5_addr *addr;
1157 		int l3index;
1158 
1159 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1160 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1161 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1162 		if (key.md5_key)
1163 			key.type = TCP_KEY_MD5;
1164 	}
1165 
1166 	/* Clean the ECN bits of ACKs for oow data or paws_reject */
1167 	tcp_v4_send_ack(sk, skb, seq,
1168 			tcp_rsk(req)->rcv_nxt,
1169 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1170 			tcp_rsk_tsval(tcp_rsk(req)),
1171 			req->ts_recent,
1172 			0, &key,
1173 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1174 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1175 			READ_ONCE(tcp_rsk(req)->txhash));
1176 	if (tcp_key_is_ao(&key))
1177 		kfree(key.traffic_key);
1178 }
1179 
1180 /*
1181  *	Send a SYN-ACK after having received a SYN.
1182  *	This still operates on a request_sock only, not on a big
1183  *	socket.
1184  */
1185 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1186 			      struct flowi *fl,
1187 			      struct request_sock *req,
1188 			      struct tcp_fastopen_cookie *foc,
1189 			      enum tcp_synack_type synack_type,
1190 			      struct sk_buff *syn_skb)
1191 {
1192 	const struct inet_request_sock *ireq = inet_rsk(req);
1193 	struct flowi4 fl4;
1194 	int err = -1;
1195 	struct sk_buff *skb;
1196 	u8 tos;
1197 
1198 	/* First, grab a route. */
1199 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1200 		return -1;
1201 
1202 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1203 
1204 	if (skb) {
1205 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1206 
1207 		tos = READ_ONCE(inet_sk(sk)->tos);
1208 
1209 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1210 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1211 			      (tos & INET_ECN_MASK);
1212 
1213 		if (!INET_ECN_is_capable(tos) &&
1214 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1215 			tos |= INET_ECN_ECT_0;
1216 
1217 		rcu_read_lock();
1218 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1219 					    ireq->ir_rmt_addr,
1220 					    rcu_dereference(ireq->ireq_opt),
1221 					    tos);
1222 		rcu_read_unlock();
1223 		err = net_xmit_eval(err);
1224 	}
1225 
1226 	return err;
1227 }
1228 
1229 /*
1230  *	IPv4 request_sock destructor.
1231  */
1232 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1233 {
1234 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1235 }
1236 
1237 #ifdef CONFIG_TCP_MD5SIG
1238 /*
1239  * RFC2385 MD5 checksumming requires a mapping of
1240  * IP address->MD5 Key.
1241  * We need to maintain these in the sk structure.
1242  */
1243 
1244 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1245 EXPORT_IPV6_MOD(tcp_md5_needed);
1246 
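/* Prefer a key bound to an L3 device over an unbound one; among keys with the
 * same binding, prefer the longer prefix.
 */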
1247 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1248 {
1249 	if (!old)
1250 		return true;
1251 
1252 	/* l3index always overrides non-l3index */
1253 	if (old->l3index && new->l3index == 0)
1254 		return false;
1255 	if (old->l3index == 0 && new->l3index)
1256 		return true;
1257 
1258 	return old->prefixlen < new->prefixlen;
1259 }
1260 
1261 /* Find the Key structure for an address.  */
1262 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1263 					   const union tcp_md5_addr *addr,
1264 					   int family, bool any_l3index)
1265 {
1266 	const struct tcp_sock *tp = tcp_sk(sk);
1267 	struct tcp_md5sig_key *key;
1268 	const struct tcp_md5sig_info *md5sig;
1269 	__be32 mask;
1270 	struct tcp_md5sig_key *best_match = NULL;
1271 	bool match;
1272 
1273 	/* caller either holds rcu_read_lock() or socket lock */
1274 	md5sig = rcu_dereference_check(tp->md5sig_info,
1275 				       lockdep_sock_is_held(sk));
1276 	if (!md5sig)
1277 		return NULL;
1278 
1279 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1280 				 lockdep_sock_is_held(sk)) {
1281 		if (key->family != family)
1282 			continue;
1283 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1284 		    key->l3index != l3index)
1285 			continue;
1286 		if (family == AF_INET) {
1287 			mask = inet_make_mask(key->prefixlen);
1288 			match = (key->addr.a4.s_addr & mask) ==
1289 				(addr->a4.s_addr & mask);
1290 #if IS_ENABLED(CONFIG_IPV6)
1291 		} else if (family == AF_INET6) {
1292 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1293 						  key->prefixlen);
1294 #endif
1295 		} else {
1296 			match = false;
1297 		}
1298 
1299 		if (match && better_md5_match(best_match, key))
1300 			best_match = key;
1301 	}
1302 	return best_match;
1303 }
1304 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1305 
1306 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1307 						      const union tcp_md5_addr *addr,
1308 						      int family, u8 prefixlen,
1309 						      int l3index, u8 flags)
1310 {
1311 	const struct tcp_sock *tp = tcp_sk(sk);
1312 	struct tcp_md5sig_key *key;
1313 	unsigned int size = sizeof(struct in_addr);
1314 	const struct tcp_md5sig_info *md5sig;
1315 
1316 	/* caller either holds rcu_read_lock() or socket lock */
1317 	md5sig = rcu_dereference_check(tp->md5sig_info,
1318 				       lockdep_sock_is_held(sk));
1319 	if (!md5sig)
1320 		return NULL;
1321 #if IS_ENABLED(CONFIG_IPV6)
1322 	if (family == AF_INET6)
1323 		size = sizeof(struct in6_addr);
1324 #endif
1325 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1326 				 lockdep_sock_is_held(sk)) {
1327 		if (key->family != family)
1328 			continue;
1329 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1330 			continue;
1331 		if (key->l3index != l3index)
1332 			continue;
1333 		if (!memcmp(&key->addr, addr, size) &&
1334 		    key->prefixlen == prefixlen)
1335 			return key;
1336 	}
1337 	return NULL;
1338 }
1339 
1340 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1341 					 const struct sock *addr_sk)
1342 {
1343 	const union tcp_md5_addr *addr;
1344 	int l3index;
1345 
1346 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1347 						 addr_sk->sk_bound_dev_if);
1348 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1349 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1350 }
1351 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1352 
1353 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1354 {
1355 	struct tcp_sock *tp = tcp_sk(sk);
1356 	struct tcp_md5sig_info *md5sig;
1357 
1358 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1359 	if (!md5sig)
1360 		return -ENOMEM;
1361 
1362 	sk_gso_disable(sk);
1363 	INIT_HLIST_HEAD(&md5sig->head);
1364 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1365 	return 0;
1366 }
1367 
1368 /* This can be called on a newly created socket, from other files */
1369 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1370 			    int family, u8 prefixlen, int l3index, u8 flags,
1371 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1372 {
1373 	/* Add Key to the list */
1374 	struct tcp_md5sig_key *key;
1375 	struct tcp_sock *tp = tcp_sk(sk);
1376 	struct tcp_md5sig_info *md5sig;
1377 
1378 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1379 	if (key) {
1380 		/* Pre-existing entry - just update that one.
1381 		 * Note that the key might be used concurrently.
1382 		 * data_race() is telling kcsan that we do not care about
1383 		 * key mismatches, since changing the MD5 key on live flows
1384 		 * can lead to packet drops.
1385 		 */
1386 		data_race(memcpy(key->key, newkey, newkeylen));
1387 
1388 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1389 		 * Also note that a reader could catch new key->keylen value
1390 		 * but old key->key[], this is the reason we use __GFP_ZERO
1391 		 * at sock_kmalloc() time below these lines.
1392 		 */
1393 		WRITE_ONCE(key->keylen, newkeylen);
1394 
1395 		return 0;
1396 	}
1397 
1398 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1399 					   lockdep_sock_is_held(sk));
1400 
1401 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1402 	if (!key)
1403 		return -ENOMEM;
1404 
1405 	memcpy(key->key, newkey, newkeylen);
1406 	key->keylen = newkeylen;
1407 	key->family = family;
1408 	key->prefixlen = prefixlen;
1409 	key->l3index = l3index;
1410 	key->flags = flags;
1411 	memcpy(&key->addr, addr,
1412 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1413 								 sizeof(struct in_addr));
1414 	hlist_add_head_rcu(&key->node, &md5sig->head);
1415 	return 0;
1416 }
1417 
1418 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1419 		   int family, u8 prefixlen, int l3index, u8 flags,
1420 		   const u8 *newkey, u8 newkeylen)
1421 {
1422 	struct tcp_sock *tp = tcp_sk(sk);
1423 
1424 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1425 		if (tcp_md5_alloc_sigpool())
1426 			return -ENOMEM;
1427 
1428 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1429 			tcp_md5_release_sigpool();
1430 			return -ENOMEM;
1431 		}
1432 
1433 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1434 			struct tcp_md5sig_info *md5sig;
1435 
1436 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1437 			rcu_assign_pointer(tp->md5sig_info, NULL);
1438 			kfree_rcu(md5sig, rcu);
1439 			tcp_md5_release_sigpool();
1440 			return -EUSERS;
1441 		}
1442 	}
1443 
1444 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1445 				newkey, newkeylen, GFP_KERNEL);
1446 }
1447 EXPORT_IPV6_MOD(tcp_md5_do_add);
1448 
1449 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1450 		     int family, u8 prefixlen, int l3index,
1451 		     struct tcp_md5sig_key *key)
1452 {
1453 	struct tcp_sock *tp = tcp_sk(sk);
1454 
1455 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1456 		tcp_md5_add_sigpool();
1457 
1458 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1459 			tcp_md5_release_sigpool();
1460 			return -ENOMEM;
1461 		}
1462 
1463 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1464 			struct tcp_md5sig_info *md5sig;
1465 
1466 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1467 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1468 			rcu_assign_pointer(tp->md5sig_info, NULL);
1469 			kfree_rcu(md5sig, rcu);
1470 			tcp_md5_release_sigpool();
1471 			return -EUSERS;
1472 		}
1473 	}
1474 
1475 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1476 				key->flags, key->key, key->keylen,
1477 				sk_gfp_mask(sk, GFP_ATOMIC));
1478 }
1479 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1480 
1481 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1482 		   u8 prefixlen, int l3index, u8 flags)
1483 {
1484 	struct tcp_md5sig_key *key;
1485 
1486 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1487 	if (!key)
1488 		return -ENOENT;
1489 	hlist_del_rcu(&key->node);
1490 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1491 	kfree_rcu(key, rcu);
1492 	return 0;
1493 }
1494 EXPORT_IPV6_MOD(tcp_md5_do_del);
1495 
1496 void tcp_clear_md5_list(struct sock *sk)
1497 {
1498 	struct tcp_sock *tp = tcp_sk(sk);
1499 	struct tcp_md5sig_key *key;
1500 	struct hlist_node *n;
1501 	struct tcp_md5sig_info *md5sig;
1502 
1503 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1504 
1505 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1506 		hlist_del_rcu(&key->node);
1507 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1508 		kfree_rcu(key, rcu);
1509 	}
1510 }
1511 
1512 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1513 				 sockptr_t optval, int optlen)
1514 {
1515 	struct tcp_md5sig cmd;
1516 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1517 	const union tcp_md5_addr *addr;
1518 	u8 prefixlen = 32;
1519 	int l3index = 0;
1520 	bool l3flag;
1521 	u8 flags;
1522 
1523 	if (optlen < sizeof(cmd))
1524 		return -EINVAL;
1525 
1526 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1527 		return -EFAULT;
1528 
1529 	if (sin->sin_family != AF_INET)
1530 		return -EINVAL;
1531 
1532 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1533 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1534 
1535 	if (optname == TCP_MD5SIG_EXT &&
1536 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1537 		prefixlen = cmd.tcpm_prefixlen;
1538 		if (prefixlen > 32)
1539 			return -EINVAL;
1540 	}
1541 
1542 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1543 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1544 		struct net_device *dev;
1545 
1546 		rcu_read_lock();
1547 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1548 		if (dev && netif_is_l3_master(dev))
1549 			l3index = dev->ifindex;
1550 
1551 		rcu_read_unlock();
1552 
1553 		/* ok to reference set/not set outside of rcu;
1554 		 * right now device MUST be an L3 master
1555 		 */
1556 		if (!dev || !l3index)
1557 			return -EINVAL;
1558 	}
1559 
1560 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1561 
1562 	if (!cmd.tcpm_keylen)
1563 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1564 
1565 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1566 		return -EINVAL;
1567 
1568 	/* Don't allow keys for peers that have a matching TCP-AO key.
1569 	 * See the comment in tcp_ao_add_cmd()
1570 	 */
1571 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1572 		return -EKEYREJECTED;
1573 
1574 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1575 			      cmd.tcpm_key, cmd.tcpm_keylen);
1576 }
1577 
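/* Feed the RFC 2385 pseudo-header (saddr, daddr, zero pad, protocol, length)
 * and the TCP header with its checksum zeroed into the MD5 hash.
 */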
1578 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1579 				   __be32 daddr, __be32 saddr,
1580 				   const struct tcphdr *th, int nbytes)
1581 {
1582 	struct tcp4_pseudohdr *bp;
1583 	struct scatterlist sg;
1584 	struct tcphdr *_th;
1585 
1586 	bp = hp->scratch;
1587 	bp->saddr = saddr;
1588 	bp->daddr = daddr;
1589 	bp->pad = 0;
1590 	bp->protocol = IPPROTO_TCP;
1591 	bp->len = cpu_to_be16(nbytes);
1592 
1593 	_th = (struct tcphdr *)(bp + 1);
1594 	memcpy(_th, th, sizeof(*th));
1595 	_th->check = 0;
1596 
1597 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1598 	ahash_request_set_crypt(hp->req, &sg, NULL,
1599 				sizeof(*bp) + sizeof(*th));
1600 	return crypto_ahash_update(hp->req);
1601 }
1602 
1603 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1604 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1605 {
1606 	struct tcp_sigpool hp;
1607 
1608 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1609 		goto clear_hash_nostart;
1610 
1611 	if (crypto_ahash_init(hp.req))
1612 		goto clear_hash;
1613 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1614 		goto clear_hash;
1615 	if (tcp_md5_hash_key(&hp, key))
1616 		goto clear_hash;
1617 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1618 	if (crypto_ahash_final(hp.req))
1619 		goto clear_hash;
1620 
1621 	tcp_sigpool_end(&hp);
1622 	return 0;
1623 
1624 clear_hash:
1625 	tcp_sigpool_end(&hp);
1626 clear_hash_nostart:
1627 	memset(md5_hash, 0, 16);
1628 	return 1;
1629 }
1630 
1631 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1632 			const struct sock *sk,
1633 			const struct sk_buff *skb)
1634 {
1635 	const struct tcphdr *th = tcp_hdr(skb);
1636 	struct tcp_sigpool hp;
1637 	__be32 saddr, daddr;
1638 
1639 	if (sk) { /* valid for establish/request sockets */
1640 		saddr = sk->sk_rcv_saddr;
1641 		daddr = sk->sk_daddr;
1642 	} else {
1643 		const struct iphdr *iph = ip_hdr(skb);
1644 		saddr = iph->saddr;
1645 		daddr = iph->daddr;
1646 	}
1647 
1648 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1649 		goto clear_hash_nostart;
1650 
1651 	if (crypto_ahash_init(hp.req))
1652 		goto clear_hash;
1653 
1654 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1655 		goto clear_hash;
1656 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1657 		goto clear_hash;
1658 	if (tcp_md5_hash_key(&hp, key))
1659 		goto clear_hash;
1660 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1661 	if (crypto_ahash_final(hp.req))
1662 		goto clear_hash;
1663 
1664 	tcp_sigpool_end(&hp);
1665 	return 0;
1666 
1667 clear_hash:
1668 	tcp_sigpool_end(&hp);
1669 clear_hash_nostart:
1670 	memset(md5_hash, 0, 16);
1671 	return 1;
1672 }
1673 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1674 
1675 #endif
1676 
1677 static void tcp_v4_init_req(struct request_sock *req,
1678 			    const struct sock *sk_listener,
1679 			    struct sk_buff *skb)
1680 {
1681 	struct inet_request_sock *ireq = inet_rsk(req);
1682 	struct net *net = sock_net(sk_listener);
1683 
1684 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1685 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1686 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1687 }
1688 
1689 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1690 					  struct sk_buff *skb,
1691 					  struct flowi *fl,
1692 					  struct request_sock *req,
1693 					  u32 tw_isn)
1694 {
1695 	tcp_v4_init_req(req, sk, skb);
1696 
1697 	if (security_inet_conn_request(sk, skb, req))
1698 		return NULL;
1699 
1700 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1701 }
1702 
1703 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1704 	.family		=	PF_INET,
1705 	.obj_size	=	sizeof(struct tcp_request_sock),
1706 	.rtx_syn_ack	=	tcp_rtx_synack,
1707 	.send_ack	=	tcp_v4_reqsk_send_ack,
1708 	.destructor	=	tcp_v4_reqsk_destructor,
1709 	.send_reset	=	tcp_v4_send_reset,
1710 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1711 };
1712 
1713 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1714 	.mss_clamp	=	TCP_MSS_DEFAULT,
1715 #ifdef CONFIG_TCP_MD5SIG
1716 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1717 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1718 #endif
1719 #ifdef CONFIG_TCP_AO
1720 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1721 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1722 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1723 #endif
1724 #ifdef CONFIG_SYN_COOKIES
1725 	.cookie_init_seq =	cookie_v4_init_sequence,
1726 #endif
1727 	.route_req	=	tcp_v4_route_req,
1728 	.init_seq	=	tcp_v4_init_seq,
1729 	.init_ts_off	=	tcp_v4_init_ts_off,
1730 	.send_synack	=	tcp_v4_send_synack,
1731 };
1732 
1733 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1734 {
1735 	/* Never answer SYNs sent to broadcast or multicast addresses */
1736 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1737 		goto drop;
1738 
1739 	return tcp_conn_request(&tcp_request_sock_ops,
1740 				&tcp_request_sock_ipv4_ops, sk, skb);
1741 
1742 drop:
1743 	tcp_listendrop(sk);
1744 	return 0;
1745 }
1746 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1747 
1748 
1749 /*
1750  * The three-way handshake has completed - we received the final valid ACK -
1751  * now create the new socket.
1752  */
1753 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1754 				  struct request_sock *req,
1755 				  struct dst_entry *dst,
1756 				  struct request_sock *req_unhash,
1757 				  bool *own_req)
1758 {
1759 	struct inet_request_sock *ireq;
1760 	bool found_dup_sk = false;
1761 	struct inet_sock *newinet;
1762 	struct tcp_sock *newtp;
1763 	struct sock *newsk;
1764 #ifdef CONFIG_TCP_MD5SIG
1765 	const union tcp_md5_addr *addr;
1766 	struct tcp_md5sig_key *key;
1767 	int l3index;
1768 #endif
1769 	struct ip_options_rcu *inet_opt;
1770 
1771 	if (sk_acceptq_is_full(sk))
1772 		goto exit_overflow;
1773 
1774 	newsk = tcp_create_openreq_child(sk, req, skb);
1775 	if (!newsk)
1776 		goto exit_nonewsk;
1777 
1778 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1779 	inet_sk_rx_dst_set(newsk, skb);
1780 
1781 	newtp		      = tcp_sk(newsk);
1782 	newinet		      = inet_sk(newsk);
1783 	ireq		      = inet_rsk(req);
1784 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1785 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1786 	newinet->mc_index     = inet_iif(skb);
1787 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1788 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1789 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1790 	if (inet_opt)
1791 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1792 	atomic_set(&newinet->inet_id, get_random_u16());
1793 
1794 	/* Set ToS of the new socket based upon the value of the incoming SYN.
1795 	 * ECT bits are set later in tcp_init_transfer().
1796 	 */
1797 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1798 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1799 
1800 	if (!dst) {
1801 		dst = inet_csk_route_child_sock(sk, newsk, req);
1802 		if (!dst)
1803 			goto put_and_exit;
1804 	} else {
1805 		/* syncookie case : see end of cookie_v4_check() */
1806 	}
1807 	sk_setup_caps(newsk, dst);
1808 
1809 	tcp_ca_openreq_child(newsk, dst);
1810 
1811 	tcp_sync_mss(newsk, dst_mtu(dst));
1812 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1813 
1814 	tcp_initialize_rcv_mss(newsk);
1815 
1816 #ifdef CONFIG_TCP_MD5SIG
1817 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1818 	/* Copy over the MD5 key from the original socket */
1819 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1820 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1821 	if (key && !tcp_rsk_used_ao(req)) {
1822 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1823 			goto put_and_exit;
1824 		sk_gso_disable(newsk);
1825 	}
1826 #endif
1827 #ifdef CONFIG_TCP_AO
1828 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1829 		goto put_and_exit; /* OOM, release back memory */
1830 #endif
1831 
1832 	if (__inet_inherit_port(sk, newsk) < 0)
1833 		goto put_and_exit;
1834 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1835 				       &found_dup_sk);
1836 	if (likely(*own_req)) {
1837 		tcp_move_syn(newtp, req);
1838 		ireq->ireq_opt = NULL;
1839 	} else {
1840 		newinet->inet_opt = NULL;
1841 
1842 		if (!req_unhash && found_dup_sk) {
1843 			/* This code path should only be executed in the
1844 			 * syncookie case
1845 			 */
1846 			bh_unlock_sock(newsk);
1847 			sock_put(newsk);
1848 			newsk = NULL;
1849 		}
1850 	}
1851 	return newsk;
1852 
1853 exit_overflow:
1854 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1855 exit_nonewsk:
1856 	dst_release(dst);
1857 exit:
1858 	tcp_listendrop(sk);
1859 	return NULL;
1860 put_and_exit:
1861 	newinet->inet_opt = NULL;
1862 	inet_csk_prepare_forced_close(newsk);
1863 	tcp_done(newsk);
1864 	goto exit;
1865 }
1866 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1867 
1868 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1869 {
1870 #ifdef CONFIG_SYN_COOKIES
1871 	const struct tcphdr *th = tcp_hdr(skb);
1872 
1873 	if (!th->syn)
1874 		sk = cookie_v4_check(sk, skb);
1875 #endif
1876 	return sk;
1877 }
1878 
1879 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1880 			 struct tcphdr *th, u32 *cookie)
1881 {
1882 	u16 mss = 0;
1883 #ifdef CONFIG_SYN_COOKIES
1884 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1885 				    &tcp_request_sock_ipv4_ops, sk, th);
1886 	if (mss) {
1887 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1888 		tcp_synq_overflow(sk);
1889 	}
1890 #endif
1891 	return mss;
1892 }
1893 
1894 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1895 							   u32));
1896 /* The socket must have its spinlock held when we get
1897  * here, unless it is a TCP_LISTEN socket.
1898  *
1899  * We have a potential double-lock case here, so even when
1900  * doing backlog processing we use the BH locking scheme.
1901  * This is because we cannot sleep with the original spinlock
1902  * held.
1903  */
1904 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1905 {
1906 	enum skb_drop_reason reason;
1907 	struct sock *rsk;
1908 
1909 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1910 		struct dst_entry *dst;
1911 
1912 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1913 						lockdep_sock_is_held(sk));
1914 
1915 		sock_rps_save_rxhash(sk, skb);
1916 		sk_mark_napi_id(sk, skb);
1917 		if (dst) {
1918 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1919 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1920 					     dst, 0)) {
1921 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1922 				dst_release(dst);
1923 			}
1924 		}
1925 		tcp_rcv_established(sk, skb);
1926 		return 0;
1927 	}
1928 
1929 	if (tcp_checksum_complete(skb))
1930 		goto csum_err;
1931 
1932 	if (sk->sk_state == TCP_LISTEN) {
1933 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1934 
1935 		if (!nsk)
1936 			return 0;
1937 		if (nsk != sk) {
1938 			reason = tcp_child_process(sk, nsk, skb);
1939 			if (reason) {
1940 				rsk = nsk;
1941 				goto reset;
1942 			}
1943 			return 0;
1944 		}
1945 	} else
1946 		sock_rps_save_rxhash(sk, skb);
1947 
1948 	reason = tcp_rcv_state_process(sk, skb);
1949 	if (reason) {
1950 		rsk = sk;
1951 		goto reset;
1952 	}
1953 	return 0;
1954 
1955 reset:
1956 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1957 discard:
1958 	sk_skb_reason_drop(sk, skb, reason);
1959 	/* Be careful here. If this function gets more complicated and
1960 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1961 	 * might be destroyed here. This current version compiles correctly,
1962 	 * but you have been warned.
1963 	 */
1964 	return 0;
1965 
1966 csum_err:
1967 	reason = SKB_DROP_REASON_TCP_CSUM;
1968 	trace_tcp_bad_csum(skb);
1969 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1970 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1971 	goto discard;
1972 }
1973 EXPORT_SYMBOL(tcp_v4_do_rcv);
1974 
1975 int tcp_v4_early_demux(struct sk_buff *skb)
1976 {
1977 	struct net *net = dev_net_rcu(skb->dev);
1978 	const struct iphdr *iph;
1979 	const struct tcphdr *th;
1980 	struct sock *sk;
1981 
1982 	if (skb->pkt_type != PACKET_HOST)
1983 		return 0;
1984 
1985 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1986 		return 0;
1987 
1988 	iph = ip_hdr(skb);
1989 	th = tcp_hdr(skb);
1990 
1991 	if (th->doff < sizeof(struct tcphdr) / 4)
1992 		return 0;
1993 
1994 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1995 				       iph->saddr, th->source,
1996 				       iph->daddr, ntohs(th->dest),
1997 				       skb->skb_iif, inet_sdif(skb));
1998 	if (sk) {
1999 		skb->sk = sk;
2000 		skb->destructor = sock_edemux;
2001 		if (sk_fullsock(sk)) {
2002 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2003 
2004 			if (dst)
2005 				dst = dst_check(dst, 0);
2006 			if (dst &&
2007 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2008 				skb_dst_set_noref(skb, dst);
2009 		}
2010 	}
2011 	return 0;
2012 }
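
/* Early demux resolves the established socket while the packet is still in
 * the IP layer, so the later lookup in tcp_v4_rcv() can steal skb->sk and the
 * cached rx dst can be reused via skb_dst_set_noref().  A hedged admin note:
 * the behaviour is controlled by the net.ipv4.tcp_early_demux (and
 * net.ipv4.ip_early_demux) sysctls, e.g. "sysctl -w net.ipv4.tcp_early_demux=0"
 * to disable it on forwarding-heavy hosts.
 */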
2013 
2014 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2015 		     enum skb_drop_reason *reason)
2016 {
2017 	u32 tail_gso_size, tail_gso_segs;
2018 	struct skb_shared_info *shinfo;
2019 	const struct tcphdr *th;
2020 	struct tcphdr *thtail;
2021 	struct sk_buff *tail;
2022 	unsigned int hdrlen;
2023 	bool fragstolen;
2024 	u32 gso_segs;
2025 	u32 gso_size;
2026 	u64 limit;
2027 	int delta;
2028 
2029 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2030 	 * we can fix skb->truesize to its real value to avoid future drops.
2031 	 * This is valid because skb is not yet charged to the socket.
2032 	 * It has been noticed that pure SACK packets were sometimes dropped
2033 	 * (if cooked by drivers without the copybreak feature).
2034 	 */
2035 	skb_condense(skb);
2036 
2037 	tcp_cleanup_skb(skb);
2038 
2039 	if (unlikely(tcp_checksum_complete(skb))) {
2040 		bh_unlock_sock(sk);
2041 		trace_tcp_bad_csum(skb);
2042 		*reason = SKB_DROP_REASON_TCP_CSUM;
2043 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2044 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2045 		return true;
2046 	}
2047 
2048 	/* Attempt coalescing to last skb in backlog, even if we are
2049 	 * above the limits.
2050 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2051 	 */
2052 	th = (const struct tcphdr *)skb->data;
2053 	hdrlen = th->doff * 4;
2054 
2055 	tail = sk->sk_backlog.tail;
2056 	if (!tail)
2057 		goto no_coalesce;
2058 	thtail = (struct tcphdr *)tail->data;
2059 
2060 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2061 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2062 	    ((TCP_SKB_CB(tail)->tcp_flags |
2063 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2064 	    !((TCP_SKB_CB(tail)->tcp_flags &
2065 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2066 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2067 	      TCP_SKB_CB(skb)->tcp_flags) &
2068 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2069 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2070 	    thtail->doff != th->doff ||
2071 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2072 		goto no_coalesce;
2073 
2074 	__skb_pull(skb, hdrlen);
2075 
2076 	shinfo = skb_shinfo(skb);
2077 	gso_size = shinfo->gso_size ?: skb->len;
2078 	gso_segs = shinfo->gso_segs ?: 1;
2079 
2080 	shinfo = skb_shinfo(tail);
2081 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2082 	tail_gso_segs = shinfo->gso_segs ?: 1;
2083 
2084 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2085 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2086 
2087 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2088 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2089 			thtail->window = th->window;
2090 		}
2091 
2092 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2093 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2094 		 * is not entered if we append a packet with a FIN.
2095 		 * SYN, RST, URG are not present.
2096 		 * ACK is set on both packets.
2097 		 * PSH : we do not really care in the TCP stack,
2098 		 *       at least for 'GRO' packets.
2099 		 */
2100 		thtail->fin |= th->fin;
2101 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2102 
2103 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2104 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2105 			tail->tstamp = skb->tstamp;
2106 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2107 		}
2108 
2109 		/* Not as strict as GRO. We only need to carry the max mss value */
2110 		shinfo->gso_size = max(gso_size, tail_gso_size);
2111 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2112 
2113 		sk->sk_backlog.len += delta;
2114 		__NET_INC_STATS(sock_net(sk),
2115 				LINUX_MIB_TCPBACKLOGCOALESCE);
2116 		kfree_skb_partial(skb, fragstolen);
2117 		return false;
2118 	}
2119 	__skb_push(skb, hdrlen);
2120 
2121 no_coalesce:
2122 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2123 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2124 	 * sk_rcvbuf in normal conditions.
2125 	 */
2126 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2127 
2128 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2129 
2130 	/* Only the socket owner can try to collapse/prune rx queues
2131 	 * to reduce memory overhead, so add a little headroom here.
2132 	 * Only a few socket backlogs are likely to be non-empty concurrently.
2133 	 */
2134 	limit += 64 * 1024;
2135 
2136 	limit = min_t(u64, limit, UINT_MAX);
2137 
2138 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2139 		bh_unlock_sock(sk);
2140 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2141 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2142 		return true;
2143 	}
2144 	return false;
2145 }
2146 EXPORT_IPV6_MOD(tcp_add_backlog);
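
/* Worked example of the backlog admission limit computed above (illustrative
 * numbers): with sk_rcvbuf = 131072 and sk_sndbuf = 16384,
 *
 *	limit = (131072 << 1) + (16384 >> 1) + 64 * 1024
 *	      = 262144 + 8192 + 65536 = 335872 bytes
 *
 * so while the owner holds the socket, roughly 328 KB of skb truesize can be
 * queued in the backlog before further packets are dropped with
 * SKB_DROP_REASON_SOCKET_BACKLOG.
 */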
2147 
2148 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2149 {
2150 	struct tcphdr *th = (struct tcphdr *)skb->data;
2151 
2152 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2153 }
2154 EXPORT_IPV6_MOD(tcp_filter);
2155 
2156 static void tcp_v4_restore_cb(struct sk_buff *skb)
2157 {
2158 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2159 		sizeof(struct inet_skb_parm));
2160 }
2161 
2162 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2163 			   const struct tcphdr *th)
2164 {
2165 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2166 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2167 	 */
2168 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2169 		sizeof(struct inet_skb_parm));
2170 	barrier();
2171 
2172 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2173 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2174 				    skb->len - th->doff * 4);
2175 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2176 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2177 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2178 	TCP_SKB_CB(skb)->sacked	 = 0;
2179 	TCP_SKB_CB(skb)->has_rxtstamp =
2180 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2181 }
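
/* Example of the end_seq arithmetic above: a segment carrying 1000 payload
 * bytes with seq = 1000 and neither SYN nor FIN set gets end_seq = 2000,
 * while a bare SYN with seq = 1000 gets end_seq = 1001, because SYN and FIN
 * each consume one unit of sequence space.
 */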
2182 
2183 /*
2184  *	From tcp_input.c
2185  */
2186 
2187 int tcp_v4_rcv(struct sk_buff *skb)
2188 {
2189 	struct net *net = dev_net_rcu(skb->dev);
2190 	enum skb_drop_reason drop_reason;
2191 	enum tcp_tw_status tw_status;
2192 	int sdif = inet_sdif(skb);
2193 	int dif = inet_iif(skb);
2194 	const struct iphdr *iph;
2195 	const struct tcphdr *th;
2196 	struct sock *sk = NULL;
2197 	bool refcounted;
2198 	int ret;
2199 	u32 isn;
2200 
2201 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2202 	if (skb->pkt_type != PACKET_HOST)
2203 		goto discard_it;
2204 
2205 	/* Count it even if it's bad */
2206 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2207 
2208 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2209 		goto discard_it;
2210 
2211 	th = (const struct tcphdr *)skb->data;
2212 
2213 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2214 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2215 		goto bad_packet;
2216 	}
2217 	if (!pskb_may_pull(skb, th->doff * 4))
2218 		goto discard_it;
2219 
2220 	/* An explanation is required here, I think.
2221 	 * Packet length and doff are validated by header prediction,
2222 	 * provided the case of th->doff==0 is eliminated.
2223 	 * So, we defer the checks. */
2224 
2225 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2226 		goto csum_error;
2227 
2228 	th = (const struct tcphdr *)skb->data;
2229 	iph = ip_hdr(skb);
2230 lookup:
2231 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2232 			       skb, __tcp_hdrlen(th), th->source,
2233 			       th->dest, sdif, &refcounted);
2234 	if (!sk)
2235 		goto no_tcp_socket;
2236 
2237 	if (sk->sk_state == TCP_TIME_WAIT)
2238 		goto do_time_wait;
2239 
2240 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2241 		struct request_sock *req = inet_reqsk(sk);
2242 		bool req_stolen = false;
2243 		struct sock *nsk;
2244 
2245 		sk = req->rsk_listener;
2246 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2247 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2248 		else
2249 			drop_reason = tcp_inbound_hash(sk, req, skb,
2250 						       &iph->saddr, &iph->daddr,
2251 						       AF_INET, dif, sdif);
2252 		if (unlikely(drop_reason)) {
2253 			sk_drops_add(sk, skb);
2254 			reqsk_put(req);
2255 			goto discard_it;
2256 		}
2257 		if (tcp_checksum_complete(skb)) {
2258 			reqsk_put(req);
2259 			goto csum_error;
2260 		}
2261 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2262 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2263 			if (!nsk) {
2264 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2265 				goto lookup;
2266 			}
2267 			sk = nsk;
2268 			/* reuseport_migrate_sock() has already held one sk_refcnt
2269 			 * before returning.
2270 			 */
2271 		} else {
2272 			/* We own a reference on the listener, increase it again
2273 			 * as we might lose it too soon.
2274 			 */
2275 			sock_hold(sk);
2276 		}
2277 		refcounted = true;
2278 		nsk = NULL;
2279 		if (!tcp_filter(sk, skb)) {
2280 			th = (const struct tcphdr *)skb->data;
2281 			iph = ip_hdr(skb);
2282 			tcp_v4_fill_cb(skb, iph, th);
2283 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2284 					    &drop_reason);
2285 		} else {
2286 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2287 		}
2288 		if (!nsk) {
2289 			reqsk_put(req);
2290 			if (req_stolen) {
2291 				/* Another cpu got exclusive access to req
2292 				 * and created a full-blown socket.
2293 				 * Try to feed this packet to this socket
2294 				 * instead of discarding it.
2295 				 */
2296 				tcp_v4_restore_cb(skb);
2297 				sock_put(sk);
2298 				goto lookup;
2299 			}
2300 			goto discard_and_relse;
2301 		}
2302 		nf_reset_ct(skb);
2303 		if (nsk == sk) {
2304 			reqsk_put(req);
2305 			tcp_v4_restore_cb(skb);
2306 		} else {
2307 			drop_reason = tcp_child_process(sk, nsk, skb);
2308 			if (drop_reason) {
2309 				enum sk_rst_reason rst_reason;
2310 
2311 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2312 				tcp_v4_send_reset(nsk, skb, rst_reason);
2313 				goto discard_and_relse;
2314 			}
2315 			sock_put(sk);
2316 			return 0;
2317 		}
2318 	}
2319 
2320 process:
2321 	if (static_branch_unlikely(&ip4_min_ttl)) {
2322 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2323 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2324 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2325 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2326 			goto discard_and_relse;
2327 		}
2328 	}
2329 
2330 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2331 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2332 		goto discard_and_relse;
2333 	}
2334 
2335 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2336 				       AF_INET, dif, sdif);
2337 	if (drop_reason)
2338 		goto discard_and_relse;
2339 
2340 	nf_reset_ct(skb);
2341 
2342 	if (tcp_filter(sk, skb)) {
2343 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2344 		goto discard_and_relse;
2345 	}
2346 	th = (const struct tcphdr *)skb->data;
2347 	iph = ip_hdr(skb);
2348 	tcp_v4_fill_cb(skb, iph, th);
2349 
2350 	skb->dev = NULL;
2351 
2352 	if (sk->sk_state == TCP_LISTEN) {
2353 		ret = tcp_v4_do_rcv(sk, skb);
2354 		goto put_and_return;
2355 	}
2356 
2357 	sk_incoming_cpu_update(sk);
2358 
2359 	bh_lock_sock_nested(sk);
2360 	tcp_segs_in(tcp_sk(sk), skb);
2361 	ret = 0;
2362 	if (!sock_owned_by_user(sk)) {
2363 		ret = tcp_v4_do_rcv(sk, skb);
2364 	} else {
2365 		if (tcp_add_backlog(sk, skb, &drop_reason))
2366 			goto discard_and_relse;
2367 	}
2368 	bh_unlock_sock(sk);
2369 
2370 put_and_return:
2371 	if (refcounted)
2372 		sock_put(sk);
2373 
2374 	return ret;
2375 
2376 no_tcp_socket:
2377 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2378 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2379 		goto discard_it;
2380 
2381 	tcp_v4_fill_cb(skb, iph, th);
2382 
2383 	if (tcp_checksum_complete(skb)) {
2384 csum_error:
2385 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2386 		trace_tcp_bad_csum(skb);
2387 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2388 bad_packet:
2389 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2390 	} else {
2391 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2392 	}
2393 
2394 discard_it:
2395 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2396 	/* Discard frame. */
2397 	sk_skb_reason_drop(sk, skb, drop_reason);
2398 	return 0;
2399 
2400 discard_and_relse:
2401 	sk_drops_add(sk, skb);
2402 	if (refcounted)
2403 		sock_put(sk);
2404 	goto discard_it;
2405 
2406 do_time_wait:
2407 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2408 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2409 		inet_twsk_put(inet_twsk(sk));
2410 		goto discard_it;
2411 	}
2412 
2413 	tcp_v4_fill_cb(skb, iph, th);
2414 
2415 	if (tcp_checksum_complete(skb)) {
2416 		inet_twsk_put(inet_twsk(sk));
2417 		goto csum_error;
2418 	}
2419 
2420 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn);
2421 	switch (tw_status) {
2422 	case TCP_TW_SYN: {
2423 		struct sock *sk2 = inet_lookup_listener(net,
2424 							net->ipv4.tcp_death_row.hashinfo,
2425 							skb, __tcp_hdrlen(th),
2426 							iph->saddr, th->source,
2427 							iph->daddr, th->dest,
2428 							inet_iif(skb),
2429 							sdif);
2430 		if (sk2) {
2431 			inet_twsk_deschedule_put(inet_twsk(sk));
2432 			sk = sk2;
2433 			tcp_v4_restore_cb(skb);
2434 			refcounted = false;
2435 			__this_cpu_write(tcp_tw_isn, isn);
2436 			goto process;
2437 		}
2438 	}
2439 		/* to ACK */
2440 		fallthrough;
2441 	case TCP_TW_ACK:
2442 	case TCP_TW_ACK_OOW:
2443 		tcp_v4_timewait_ack(sk, skb, tw_status);
2444 		break;
2445 	case TCP_TW_RST:
2446 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2447 		inet_twsk_deschedule_put(inet_twsk(sk));
2448 		goto discard_it;
2449 	case TCP_TW_SUCCESS:;
2450 	}
2451 	goto discard_it;
2452 }
2453 
2454 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2455 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2456 	.twsk_destructor= tcp_twsk_destructor,
2457 };
2458 
2459 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2460 {
2461 	struct dst_entry *dst = skb_dst(skb);
2462 
2463 	if (dst && dst_hold_safe(dst)) {
2464 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2465 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2466 	}
2467 }
2468 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2469 
2470 const struct inet_connection_sock_af_ops ipv4_specific = {
2471 	.queue_xmit	   = ip_queue_xmit,
2472 	.send_check	   = tcp_v4_send_check,
2473 	.rebuild_header	   = inet_sk_rebuild_header,
2474 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2475 	.conn_request	   = tcp_v4_conn_request,
2476 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2477 	.net_header_len	   = sizeof(struct iphdr),
2478 	.setsockopt	   = ip_setsockopt,
2479 	.getsockopt	   = ip_getsockopt,
2480 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2481 };
2482 EXPORT_IPV6_MOD(ipv4_specific);
2483 
2484 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2485 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2486 #ifdef CONFIG_TCP_MD5SIG
2487 	.md5_lookup		= tcp_v4_md5_lookup,
2488 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2489 	.md5_parse		= tcp_v4_parse_md5_keys,
2490 #endif
2491 #ifdef CONFIG_TCP_AO
2492 	.ao_lookup		= tcp_v4_ao_lookup,
2493 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2494 	.ao_parse		= tcp_v4_parse_ao,
2495 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2496 #endif
2497 };
2498 #endif
2499 
2500 /* NOTE: A lot of things are set to zero explicitly by the call to
2501  *       sk_alloc(), so they need not be done here.
2502  */
2503 static int tcp_v4_init_sock(struct sock *sk)
2504 {
2505 	struct inet_connection_sock *icsk = inet_csk(sk);
2506 
2507 	tcp_init_sock(sk);
2508 
2509 	icsk->icsk_af_ops = &ipv4_specific;
2510 
2511 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2512 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2513 #endif
2514 
2515 	return 0;
2516 }
2517 
2518 #ifdef CONFIG_TCP_MD5SIG
2519 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2520 {
2521 	struct tcp_md5sig_info *md5sig;
2522 
2523 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2524 	kfree(md5sig);
2525 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2526 	tcp_md5_release_sigpool();
2527 }
2528 #endif
2529 
2530 static void tcp_release_user_frags(struct sock *sk)
2531 {
2532 #ifdef CONFIG_PAGE_POOL
2533 	unsigned long index;
2534 	void *netmem;
2535 
2536 	xa_for_each(&sk->sk_user_frags, index, netmem)
2537 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2538 #endif
2539 }
2540 
2541 void tcp_v4_destroy_sock(struct sock *sk)
2542 {
2543 	struct tcp_sock *tp = tcp_sk(sk);
2544 
2545 	tcp_release_user_frags(sk);
2546 
2547 	xa_destroy(&sk->sk_user_frags);
2548 
2549 	trace_tcp_destroy_sock(sk);
2550 
2551 	tcp_clear_xmit_timers(sk);
2552 
2553 	tcp_cleanup_congestion_control(sk);
2554 
2555 	tcp_cleanup_ulp(sk);
2556 
2557 	/* Clean up the write buffer. */
2558 	tcp_write_queue_purge(sk);
2559 
2560 	/* Check if we want to disable active TFO */
2561 	tcp_fastopen_active_disable_ofo_check(sk);
2562 
2563 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2564 	skb_rbtree_purge(&tp->out_of_order_queue);
2565 
2566 #ifdef CONFIG_TCP_MD5SIG
2567 	/* Clean up the MD5 key list, if any */
2568 	if (tp->md5sig_info) {
2569 		struct tcp_md5sig_info *md5sig;
2570 
2571 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2572 		tcp_clear_md5_list(sk);
2573 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2574 		rcu_assign_pointer(tp->md5sig_info, NULL);
2575 	}
2576 #endif
2577 	tcp_ao_destroy_sock(sk, false);
2578 
2579 	/* Clean up a referenced TCP bind bucket. */
2580 	if (inet_csk(sk)->icsk_bind_hash)
2581 		inet_put_port(sk);
2582 
2583 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2584 
2585 	/* If socket is aborted during connect operation */
2586 	tcp_free_fastopen_req(tp);
2587 	tcp_fastopen_destroy_cipher(sk);
2588 	tcp_saved_syn_free(tp);
2589 
2590 	sk_sockets_allocated_dec(sk);
2591 }
2592 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2593 
2594 #ifdef CONFIG_PROC_FS
2595 /* Proc filesystem TCP sock list dumping. */
2596 
2597 static unsigned short seq_file_family(const struct seq_file *seq);
2598 
2599 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2600 {
2601 	unsigned short family = seq_file_family(seq);
2602 
2603 	/* AF_UNSPEC is used as a match all */
2604 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2605 		net_eq(sock_net(sk), seq_file_net(seq)));
2606 }
2607 
2608 /* Find a non-empty bucket (starting from st->bucket)
2609  * and return the first sk from it.
2610  */
2611 static void *listening_get_first(struct seq_file *seq)
2612 {
2613 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2614 	struct tcp_iter_state *st = seq->private;
2615 
2616 	st->offset = 0;
2617 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2618 		struct inet_listen_hashbucket *ilb2;
2619 		struct hlist_nulls_node *node;
2620 		struct sock *sk;
2621 
2622 		ilb2 = &hinfo->lhash2[st->bucket];
2623 		if (hlist_nulls_empty(&ilb2->nulls_head))
2624 			continue;
2625 
2626 		spin_lock(&ilb2->lock);
2627 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2628 			if (seq_sk_match(seq, sk))
2629 				return sk;
2630 		}
2631 		spin_unlock(&ilb2->lock);
2632 	}
2633 
2634 	return NULL;
2635 }
2636 
2637 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2638  * If "cur" is the last one in st->bucket,
2639  * call listening_get_first() to return the first sk of the next
2640  * non-empty bucket.
2641  */
2642 static void *listening_get_next(struct seq_file *seq, void *cur)
2643 {
2644 	struct tcp_iter_state *st = seq->private;
2645 	struct inet_listen_hashbucket *ilb2;
2646 	struct hlist_nulls_node *node;
2647 	struct inet_hashinfo *hinfo;
2648 	struct sock *sk = cur;
2649 
2650 	++st->num;
2651 	++st->offset;
2652 
2653 	sk = sk_nulls_next(sk);
2654 	sk_nulls_for_each_from(sk, node) {
2655 		if (seq_sk_match(seq, sk))
2656 			return sk;
2657 	}
2658 
2659 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2660 	ilb2 = &hinfo->lhash2[st->bucket];
2661 	spin_unlock(&ilb2->lock);
2662 	++st->bucket;
2663 	return listening_get_first(seq);
2664 }
2665 
2666 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2667 {
2668 	struct tcp_iter_state *st = seq->private;
2669 	void *rc;
2670 
2671 	st->bucket = 0;
2672 	st->offset = 0;
2673 	rc = listening_get_first(seq);
2674 
2675 	while (rc && *pos) {
2676 		rc = listening_get_next(seq, rc);
2677 		--*pos;
2678 	}
2679 	return rc;
2680 }
2681 
2682 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2683 				const struct tcp_iter_state *st)
2684 {
2685 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2686 }
2687 
2688 /*
2689  * Get first established socket starting from bucket given in st->bucket.
2690  * If st->bucket is zero, the very first socket in the hash is returned.
2691  */
2692 static void *established_get_first(struct seq_file *seq)
2693 {
2694 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2695 	struct tcp_iter_state *st = seq->private;
2696 
2697 	st->offset = 0;
2698 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2699 		struct sock *sk;
2700 		struct hlist_nulls_node *node;
2701 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2702 
2703 		cond_resched();
2704 
2705 		/* Lockless fast path for the common case of empty buckets */
2706 		if (empty_bucket(hinfo, st))
2707 			continue;
2708 
2709 		spin_lock_bh(lock);
2710 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2711 			if (seq_sk_match(seq, sk))
2712 				return sk;
2713 		}
2714 		spin_unlock_bh(lock);
2715 	}
2716 
2717 	return NULL;
2718 }
2719 
2720 static void *established_get_next(struct seq_file *seq, void *cur)
2721 {
2722 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2723 	struct tcp_iter_state *st = seq->private;
2724 	struct hlist_nulls_node *node;
2725 	struct sock *sk = cur;
2726 
2727 	++st->num;
2728 	++st->offset;
2729 
2730 	sk = sk_nulls_next(sk);
2731 
2732 	sk_nulls_for_each_from(sk, node) {
2733 		if (seq_sk_match(seq, sk))
2734 			return sk;
2735 	}
2736 
2737 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2738 	++st->bucket;
2739 	return established_get_first(seq);
2740 }
2741 
2742 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2743 {
2744 	struct tcp_iter_state *st = seq->private;
2745 	void *rc;
2746 
2747 	st->bucket = 0;
2748 	rc = established_get_first(seq);
2749 
2750 	while (rc && pos) {
2751 		rc = established_get_next(seq, rc);
2752 		--pos;
2753 	}
2754 	return rc;
2755 }
2756 
2757 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2758 {
2759 	void *rc;
2760 	struct tcp_iter_state *st = seq->private;
2761 
2762 	st->state = TCP_SEQ_STATE_LISTENING;
2763 	rc	  = listening_get_idx(seq, &pos);
2764 
2765 	if (!rc) {
2766 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2767 		rc	  = established_get_idx(seq, pos);
2768 	}
2769 
2770 	return rc;
2771 }
2772 
2773 static void *tcp_seek_last_pos(struct seq_file *seq)
2774 {
2775 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2776 	struct tcp_iter_state *st = seq->private;
2777 	int bucket = st->bucket;
2778 	int offset = st->offset;
2779 	int orig_num = st->num;
2780 	void *rc = NULL;
2781 
2782 	switch (st->state) {
2783 	case TCP_SEQ_STATE_LISTENING:
2784 		if (st->bucket > hinfo->lhash2_mask)
2785 			break;
2786 		rc = listening_get_first(seq);
2787 		while (offset-- && rc && bucket == st->bucket)
2788 			rc = listening_get_next(seq, rc);
2789 		if (rc)
2790 			break;
2791 		st->bucket = 0;
2792 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2793 		fallthrough;
2794 	case TCP_SEQ_STATE_ESTABLISHED:
2795 		if (st->bucket > hinfo->ehash_mask)
2796 			break;
2797 		rc = established_get_first(seq);
2798 		while (offset-- && rc && bucket == st->bucket)
2799 			rc = established_get_next(seq, rc);
2800 	}
2801 
2802 	st->num = orig_num;
2803 
2804 	return rc;
2805 }
2806 
2807 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2808 {
2809 	struct tcp_iter_state *st = seq->private;
2810 	void *rc;
2811 
2812 	if (*pos && *pos == st->last_pos) {
2813 		rc = tcp_seek_last_pos(seq);
2814 		if (rc)
2815 			goto out;
2816 	}
2817 
2818 	st->state = TCP_SEQ_STATE_LISTENING;
2819 	st->num = 0;
2820 	st->bucket = 0;
2821 	st->offset = 0;
2822 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2823 
2824 out:
2825 	st->last_pos = *pos;
2826 	return rc;
2827 }
2828 EXPORT_IPV6_MOD(tcp_seq_start);
2829 
2830 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2831 {
2832 	struct tcp_iter_state *st = seq->private;
2833 	void *rc = NULL;
2834 
2835 	if (v == SEQ_START_TOKEN) {
2836 		rc = tcp_get_idx(seq, 0);
2837 		goto out;
2838 	}
2839 
2840 	switch (st->state) {
2841 	case TCP_SEQ_STATE_LISTENING:
2842 		rc = listening_get_next(seq, v);
2843 		if (!rc) {
2844 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2845 			st->bucket = 0;
2846 			st->offset = 0;
2847 			rc	  = established_get_first(seq);
2848 		}
2849 		break;
2850 	case TCP_SEQ_STATE_ESTABLISHED:
2851 		rc = established_get_next(seq, v);
2852 		break;
2853 	}
2854 out:
2855 	++*pos;
2856 	st->last_pos = *pos;
2857 	return rc;
2858 }
2859 EXPORT_IPV6_MOD(tcp_seq_next);
2860 
2861 void tcp_seq_stop(struct seq_file *seq, void *v)
2862 {
2863 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2864 	struct tcp_iter_state *st = seq->private;
2865 
2866 	switch (st->state) {
2867 	case TCP_SEQ_STATE_LISTENING:
2868 		if (v != SEQ_START_TOKEN)
2869 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2870 		break;
2871 	case TCP_SEQ_STATE_ESTABLISHED:
2872 		if (v)
2873 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2874 		break;
2875 	}
2876 }
2877 EXPORT_IPV6_MOD(tcp_seq_stop);
2878 
2879 static void get_openreq4(const struct request_sock *req,
2880 			 struct seq_file *f, int i)
2881 {
2882 	const struct inet_request_sock *ireq = inet_rsk(req);
2883 	long delta = req->rsk_timer.expires - jiffies;
2884 
2885 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2886 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2887 		i,
2888 		ireq->ir_loc_addr,
2889 		ireq->ir_num,
2890 		ireq->ir_rmt_addr,
2891 		ntohs(ireq->ir_rmt_port),
2892 		TCP_SYN_RECV,
2893 		0, 0, /* could print option size, but that is af dependent. */
2894 		1,    /* timers active (only the expire timer) */
2895 		jiffies_delta_to_clock_t(delta),
2896 		req->num_timeout,
2897 		from_kuid_munged(seq_user_ns(f),
2898 				 sock_i_uid(req->rsk_listener)),
2899 		0,  /* non standard timer */
2900 		0, /* open_requests have no inode */
2901 		0,
2902 		req);
2903 }
2904 
2905 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2906 {
2907 	int timer_active;
2908 	unsigned long timer_expires;
2909 	const struct tcp_sock *tp = tcp_sk(sk);
2910 	const struct inet_connection_sock *icsk = inet_csk(sk);
2911 	const struct inet_sock *inet = inet_sk(sk);
2912 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2913 	__be32 dest = inet->inet_daddr;
2914 	__be32 src = inet->inet_rcv_saddr;
2915 	__u16 destp = ntohs(inet->inet_dport);
2916 	__u16 srcp = ntohs(inet->inet_sport);
2917 	u8 icsk_pending;
2918 	int rx_queue;
2919 	int state;
2920 
2921 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2922 	if (icsk_pending == ICSK_TIME_RETRANS ||
2923 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2924 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2925 		timer_active	= 1;
2926 		timer_expires	= icsk_timeout(icsk);
2927 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2928 		timer_active	= 4;
2929 		timer_expires	= icsk_timeout(icsk);
2930 	} else if (timer_pending(&sk->sk_timer)) {
2931 		timer_active	= 2;
2932 		timer_expires	= sk->sk_timer.expires;
2933 	} else {
2934 		timer_active	= 0;
2935 		timer_expires = jiffies;
2936 	}
2937 
2938 	state = inet_sk_state_load(sk);
2939 	if (state == TCP_LISTEN)
2940 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2941 	else
2942 		/* Because we don't lock the socket,
2943 		 * we might find a transient negative value.
2944 		 */
2945 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2946 				      READ_ONCE(tp->copied_seq), 0);
2947 
2948 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2949 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2950 		i, src, srcp, dest, destp, state,
2951 		READ_ONCE(tp->write_seq) - tp->snd_una,
2952 		rx_queue,
2953 		timer_active,
2954 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2955 		icsk->icsk_retransmits,
2956 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2957 		icsk->icsk_probes_out,
2958 		sock_i_ino(sk),
2959 		refcount_read(&sk->sk_refcnt), sk,
2960 		jiffies_to_clock_t(icsk->icsk_rto),
2961 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2962 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2963 		tcp_snd_cwnd(tp),
2964 		state == TCP_LISTEN ?
2965 		    fastopenq->max_qlen :
2966 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2967 }
2968 
2969 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2970 			       struct seq_file *f, int i)
2971 {
2972 	long delta = tw->tw_timer.expires - jiffies;
2973 	__be32 dest, src;
2974 	__u16 destp, srcp;
2975 
2976 	dest  = tw->tw_daddr;
2977 	src   = tw->tw_rcv_saddr;
2978 	destp = ntohs(tw->tw_dport);
2979 	srcp  = ntohs(tw->tw_sport);
2980 
2981 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2982 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2983 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2984 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2985 		refcount_read(&tw->tw_refcnt), tw);
2986 }
2987 
2988 #define TMPSZ 150
2989 
2990 static int tcp4_seq_show(struct seq_file *seq, void *v)
2991 {
2992 	struct tcp_iter_state *st;
2993 	struct sock *sk = v;
2994 
2995 	seq_setwidth(seq, TMPSZ - 1);
2996 	if (v == SEQ_START_TOKEN) {
2997 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2998 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2999 			   "inode");
3000 		goto out;
3001 	}
3002 	st = seq->private;
3003 
3004 	if (sk->sk_state == TCP_TIME_WAIT)
3005 		get_timewait4_sock(v, seq, st->num);
3006 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
3007 		get_openreq4(v, seq, st->num);
3008 	else
3009 		get_tcp4_sock(v, seq, st->num);
3010 out:
3011 	seq_pad(seq, '\n');
3012 	return 0;
3013 }
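
/* What the helpers above render in /proc/net/tcp, one line per socket
 * (a hand-made, illustrative sample; exact spacing depends on the format
 * strings above):
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 0100007F:D2A4 01 00000000:00000000 02:000A7214 00000000  1000        0 12345 ...
 *
 * Addresses and ports are hexadecimal; the address is the raw __be32, so
 * 127.0.0.1 typically shows up as 0100007F on little-endian hosts.  The "st"
 * column is the TCP state (01 = TCP_ESTABLISHED, 0A = TCP_LISTEN), and
 * "tr:tm->when" encodes the pending timer (here 02, the keepalive timer).
 */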
3014 
3015 #ifdef CONFIG_BPF_SYSCALL
3016 struct bpf_tcp_iter_state {
3017 	struct tcp_iter_state state;
3018 	unsigned int cur_sk;
3019 	unsigned int end_sk;
3020 	unsigned int max_sk;
3021 	struct sock **batch;
3022 	bool st_bucket_done;
3023 };
3024 
3025 struct bpf_iter__tcp {
3026 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3027 	__bpf_md_ptr(struct sock_common *, sk_common);
3028 	uid_t uid __aligned(8);
3029 };
3030 
3031 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3032 			     struct sock_common *sk_common, uid_t uid)
3033 {
3034 	struct bpf_iter__tcp ctx;
3035 
3036 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3037 	ctx.meta = meta;
3038 	ctx.sk_common = sk_common;
3039 	ctx.uid = uid;
3040 	return bpf_iter_run_prog(prog, &ctx);
3041 }
3042 
3043 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3044 {
3045 	while (iter->cur_sk < iter->end_sk)
3046 		sock_gen_put(iter->batch[iter->cur_sk++]);
3047 }
3048 
3049 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3050 				      unsigned int new_batch_sz)
3051 {
3052 	struct sock **new_batch;
3053 
3054 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3055 			     GFP_USER | __GFP_NOWARN);
3056 	if (!new_batch)
3057 		return -ENOMEM;
3058 
3059 	bpf_iter_tcp_put_batch(iter);
3060 	kvfree(iter->batch);
3061 	iter->batch = new_batch;
3062 	iter->max_sk = new_batch_sz;
3063 
3064 	return 0;
3065 }
3066 
3067 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3068 						 struct sock *start_sk)
3069 {
3070 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3071 	struct bpf_tcp_iter_state *iter = seq->private;
3072 	struct tcp_iter_state *st = &iter->state;
3073 	struct hlist_nulls_node *node;
3074 	unsigned int expected = 1;
3075 	struct sock *sk;
3076 
3077 	sock_hold(start_sk);
3078 	iter->batch[iter->end_sk++] = start_sk;
3079 
3080 	sk = sk_nulls_next(start_sk);
3081 	sk_nulls_for_each_from(sk, node) {
3082 		if (seq_sk_match(seq, sk)) {
3083 			if (iter->end_sk < iter->max_sk) {
3084 				sock_hold(sk);
3085 				iter->batch[iter->end_sk++] = sk;
3086 			}
3087 			expected++;
3088 		}
3089 	}
3090 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3091 
3092 	return expected;
3093 }
3094 
3095 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3096 						   struct sock *start_sk)
3097 {
3098 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3099 	struct bpf_tcp_iter_state *iter = seq->private;
3100 	struct tcp_iter_state *st = &iter->state;
3101 	struct hlist_nulls_node *node;
3102 	unsigned int expected = 1;
3103 	struct sock *sk;
3104 
3105 	sock_hold(start_sk);
3106 	iter->batch[iter->end_sk++] = start_sk;
3107 
3108 	sk = sk_nulls_next(start_sk);
3109 	sk_nulls_for_each_from(sk, node) {
3110 		if (seq_sk_match(seq, sk)) {
3111 			if (iter->end_sk < iter->max_sk) {
3112 				sock_hold(sk);
3113 				iter->batch[iter->end_sk++] = sk;
3114 			}
3115 			expected++;
3116 		}
3117 	}
3118 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3119 
3120 	return expected;
3121 }
3122 
3123 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3124 {
3125 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3126 	struct bpf_tcp_iter_state *iter = seq->private;
3127 	struct tcp_iter_state *st = &iter->state;
3128 	unsigned int expected;
3129 	bool resized = false;
3130 	struct sock *sk;
3131 
3132 	/* The st->bucket is done.  Directly advance to the next
3133 	 * bucket instead of having tcp_seek_last_pos() skip entries
3134 	 * one by one in the current bucket, only to eventually find out
3135 	 * it has to advance to the next bucket.
3136 	 */
3137 	if (iter->st_bucket_done) {
3138 		st->offset = 0;
3139 		st->bucket++;
3140 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3141 		    st->bucket > hinfo->lhash2_mask) {
3142 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3143 			st->bucket = 0;
3144 		}
3145 	}
3146 
3147 again:
3148 	/* Get a new batch */
3149 	iter->cur_sk = 0;
3150 	iter->end_sk = 0;
3151 	iter->st_bucket_done = false;
3152 
3153 	sk = tcp_seek_last_pos(seq);
3154 	if (!sk)
3155 		return NULL; /* Done */
3156 
3157 	if (st->state == TCP_SEQ_STATE_LISTENING)
3158 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3159 	else
3160 		expected = bpf_iter_tcp_established_batch(seq, sk);
3161 
3162 	if (iter->end_sk == expected) {
3163 		iter->st_bucket_done = true;
3164 		return sk;
3165 	}
3166 
3167 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3168 		resized = true;
3169 		goto again;
3170 	}
3171 
3172 	return sk;
3173 }
3174 
3175 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3176 {
3177 	/* bpf iter does not support lseek, so it always
3178 	 * continues from where it was stop()-ped.
3179 	 */
3180 	if (*pos)
3181 		return bpf_iter_tcp_batch(seq);
3182 
3183 	return SEQ_START_TOKEN;
3184 }
3185 
3186 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3187 {
3188 	struct bpf_tcp_iter_state *iter = seq->private;
3189 	struct tcp_iter_state *st = &iter->state;
3190 	struct sock *sk;
3191 
3192 	/* Whenever seq_next() is called, the iter->cur_sk is
3193 	 * done with seq_show(), so advance to the next sk in
3194 	 * the batch.
3195 	 */
3196 	if (iter->cur_sk < iter->end_sk) {
3197 		/* Keeping st->num consistent in tcp_iter_state.
3198 		 * bpf_iter_tcp does not use st->num.
3199 		 * meta.seq_num is used instead.
3200 		 */
3201 		st->num++;
3202 		/* Move st->offset to the next sk in the bucket such that
3203 		 * the future start() will resume at st->offset in
3204 		 * st->bucket.  See tcp_seek_last_pos().
3205 		 */
3206 		st->offset++;
3207 		sock_gen_put(iter->batch[iter->cur_sk++]);
3208 	}
3209 
3210 	if (iter->cur_sk < iter->end_sk)
3211 		sk = iter->batch[iter->cur_sk];
3212 	else
3213 		sk = bpf_iter_tcp_batch(seq);
3214 
3215 	++*pos;
3216 	/* Keeping st->last_pos consistent in tcp_iter_state.
3217 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3218 	 */
3219 	st->last_pos = *pos;
3220 	return sk;
3221 }
3222 
3223 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3224 {
3225 	struct bpf_iter_meta meta;
3226 	struct bpf_prog *prog;
3227 	struct sock *sk = v;
3228 	uid_t uid;
3229 	int ret;
3230 
3231 	if (v == SEQ_START_TOKEN)
3232 		return 0;
3233 
3234 	if (sk_fullsock(sk))
3235 		lock_sock(sk);
3236 
3237 	if (unlikely(sk_unhashed(sk))) {
3238 		ret = SEQ_SKIP;
3239 		goto unlock;
3240 	}
3241 
3242 	if (sk->sk_state == TCP_TIME_WAIT) {
3243 		uid = 0;
3244 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3245 		const struct request_sock *req = v;
3246 
3247 		uid = from_kuid_munged(seq_user_ns(seq),
3248 				       sock_i_uid(req->rsk_listener));
3249 	} else {
3250 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3251 	}
3252 
3253 	meta.seq = seq;
3254 	prog = bpf_iter_get_info(&meta, false);
3255 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3256 
3257 unlock:
3258 	if (sk_fullsock(sk))
3259 		release_sock(sk);
3260 	return ret;
3261 
3262 }
3263 
3264 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3265 {
3266 	struct bpf_tcp_iter_state *iter = seq->private;
3267 	struct bpf_iter_meta meta;
3268 	struct bpf_prog *prog;
3269 
3270 	if (!v) {
3271 		meta.seq = seq;
3272 		prog = bpf_iter_get_info(&meta, true);
3273 		if (prog)
3274 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3275 	}
3276 
3277 	if (iter->cur_sk < iter->end_sk) {
3278 		bpf_iter_tcp_put_batch(iter);
3279 		iter->st_bucket_done = false;
3280 	}
3281 }
3282 
3283 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3284 	.show		= bpf_iter_tcp_seq_show,
3285 	.start		= bpf_iter_tcp_seq_start,
3286 	.next		= bpf_iter_tcp_seq_next,
3287 	.stop		= bpf_iter_tcp_seq_stop,
3288 };
3289 #endif
3290 static unsigned short seq_file_family(const struct seq_file *seq)
3291 {
3292 	const struct tcp_seq_afinfo *afinfo;
3293 
3294 #ifdef CONFIG_BPF_SYSCALL
3295 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3296 	if (seq->op == &bpf_iter_tcp_seq_ops)
3297 		return AF_UNSPEC;
3298 #endif
3299 
3300 	/* Iterated from proc fs */
3301 	afinfo = pde_data(file_inode(seq->file));
3302 	return afinfo->family;
3303 }
3304 
3305 static const struct seq_operations tcp4_seq_ops = {
3306 	.show		= tcp4_seq_show,
3307 	.start		= tcp_seq_start,
3308 	.next		= tcp_seq_next,
3309 	.stop		= tcp_seq_stop,
3310 };
3311 
3312 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3313 	.family		= AF_INET,
3314 };
3315 
3316 static int __net_init tcp4_proc_init_net(struct net *net)
3317 {
3318 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3319 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3320 		return -ENOMEM;
3321 	return 0;
3322 }
3323 
3324 static void __net_exit tcp4_proc_exit_net(struct net *net)
3325 {
3326 	remove_proc_entry("tcp", net->proc_net);
3327 }
3328 
3329 static struct pernet_operations tcp4_net_ops = {
3330 	.init = tcp4_proc_init_net,
3331 	.exit = tcp4_proc_exit_net,
3332 };
3333 
3334 int __init tcp4_proc_init(void)
3335 {
3336 	return register_pernet_subsys(&tcp4_net_ops);
3337 }
3338 
3339 void tcp4_proc_exit(void)
3340 {
3341 	unregister_pernet_subsys(&tcp4_net_ops);
3342 }
3343 #endif /* CONFIG_PROC_FS */
3344 
3345 /* @wake is one when sk_stream_write_space() calls us.
3346  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3347  * This mimics the strategy used in sock_def_write_space().
3348  */
3349 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3350 {
3351 	const struct tcp_sock *tp = tcp_sk(sk);
3352 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3353 			    READ_ONCE(tp->snd_nxt);
3354 
3355 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3356 }
3357 EXPORT_SYMBOL(tcp_stream_memory_free);
3358 
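
/* Worked example of the check above: with the notsent limit set to 131072
 * bytes (net.ipv4.tcp_notsent_lowat or the TCP_NOTSENT_LOWAT socket option),
 * a plain poll() caller (wake == 0) sees the socket as writable while
 * write_seq - snd_nxt stays below 128 KB.  When invoked from
 * sk_stream_write_space() (wake == 1), the unsent byte count is doubled
 * before the comparison, so EPOLLOUT is only generated once unsent data
 * drops below 64 KB, which is the sock_def_write_space()-like behaviour the
 * comment above refers to.
 */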
3359 struct proto tcp_prot = {
3360 	.name			= "TCP",
3361 	.owner			= THIS_MODULE,
3362 	.close			= tcp_close,
3363 	.pre_connect		= tcp_v4_pre_connect,
3364 	.connect		= tcp_v4_connect,
3365 	.disconnect		= tcp_disconnect,
3366 	.accept			= inet_csk_accept,
3367 	.ioctl			= tcp_ioctl,
3368 	.init			= tcp_v4_init_sock,
3369 	.destroy		= tcp_v4_destroy_sock,
3370 	.shutdown		= tcp_shutdown,
3371 	.setsockopt		= tcp_setsockopt,
3372 	.getsockopt		= tcp_getsockopt,
3373 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3374 	.keepalive		= tcp_set_keepalive,
3375 	.recvmsg		= tcp_recvmsg,
3376 	.sendmsg		= tcp_sendmsg,
3377 	.splice_eof		= tcp_splice_eof,
3378 	.backlog_rcv		= tcp_v4_do_rcv,
3379 	.release_cb		= tcp_release_cb,
3380 	.hash			= inet_hash,
3381 	.unhash			= inet_unhash,
3382 	.get_port		= inet_csk_get_port,
3383 	.put_port		= inet_put_port,
3384 #ifdef CONFIG_BPF_SYSCALL
3385 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3386 #endif
3387 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3388 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3389 	.stream_memory_free	= tcp_stream_memory_free,
3390 	.sockets_allocated	= &tcp_sockets_allocated,
3391 	.orphan_count		= &tcp_orphan_count,
3392 
3393 	.memory_allocated	= &tcp_memory_allocated,
3394 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3395 
3396 	.memory_pressure	= &tcp_memory_pressure,
3397 	.sysctl_mem		= sysctl_tcp_mem,
3398 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3399 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3400 	.max_header		= MAX_TCP_HEADER,
3401 	.obj_size		= sizeof(struct tcp_sock),
3402 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3403 	.twsk_prot		= &tcp_timewait_sock_ops,
3404 	.rsk_prot		= &tcp_request_sock_ops,
3405 	.h.hashinfo		= NULL,
3406 	.no_autobind		= true,
3407 	.diag_destroy		= tcp_abort,
3408 };
3409 EXPORT_SYMBOL(tcp_prot);
3410 
3411 static void __net_exit tcp_sk_exit(struct net *net)
3412 {
3413 	if (net->ipv4.tcp_congestion_control)
3414 		bpf_module_put(net->ipv4.tcp_congestion_control,
3415 			       net->ipv4.tcp_congestion_control->owner);
3416 }
3417 
3418 static void __net_init tcp_set_hashinfo(struct net *net)
3419 {
3420 	struct inet_hashinfo *hinfo;
3421 	unsigned int ehash_entries;
3422 	struct net *old_net;
3423 
3424 	if (net_eq(net, &init_net))
3425 		goto fallback;
3426 
3427 	old_net = current->nsproxy->net_ns;
3428 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3429 	if (!ehash_entries)
3430 		goto fallback;
3431 
3432 	ehash_entries = roundup_pow_of_two(ehash_entries);
3433 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3434 	if (!hinfo) {
3435 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3436 			"for a netns, fallback to the global one\n",
3437 			ehash_entries);
3438 fallback:
3439 		hinfo = &tcp_hashinfo;
3440 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3441 	}
3442 
3443 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3444 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3445 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3446 }
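
/* A hedged usage sketch for the per-netns ehash selection above: the creating
 * namespace sets net.ipv4.tcp_child_ehash_entries before spawning a child
 * netns, e.g.
 *
 *	# sysctl -w net.ipv4.tcp_child_ehash_entries=4096
 *	# unshare --net ...
 *
 * The child then gets its own 4096-entry established hash (values are rounded
 * up to a power of two) instead of sharing the global tcp_hashinfo; leaving
 * the sysctl at 0 keeps the fallback to the global table.
 */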
3447 
3448 static int __net_init tcp_sk_init(struct net *net)
3449 {
3450 	net->ipv4.sysctl_tcp_ecn = 2;
3451 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3452 
3453 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3454 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3455 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3456 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3457 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3458 
3459 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3460 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3461 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3462 
3463 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3464 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3465 	net->ipv4.sysctl_tcp_syncookies = 1;
3466 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3467 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3468 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3469 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3470 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3471 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3472 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3473 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3474 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3475 
3476 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3477 	tcp_set_hashinfo(net);
3478 
3479 	net->ipv4.sysctl_tcp_sack = 1;
3480 	net->ipv4.sysctl_tcp_window_scaling = 1;
3481 	net->ipv4.sysctl_tcp_timestamps = 1;
3482 	net->ipv4.sysctl_tcp_early_retrans = 3;
3483 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3484 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3485 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3486 	net->ipv4.sysctl_tcp_max_reordering = 300;
3487 	net->ipv4.sysctl_tcp_dsack = 1;
3488 	net->ipv4.sysctl_tcp_app_win = 31;
3489 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3490 	net->ipv4.sysctl_tcp_frto = 2;
3491 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3492 	/* This limits the percentage of the congestion window which we
3493 	 * will allow a single TSO frame to consume.  Building TSO frames
3494 	 * which are too large can cause TCP streams to be bursty.
3495 	 */
3496 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3497 	/* Default TSQ limit of 16 TSO segments */
3498 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3499 
3500 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3501 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3502 
3503 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3504 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3505 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3506 	net->ipv4.sysctl_tcp_autocorking = 1;
3507 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3508 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3509 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3510 	if (net != &init_net) {
3511 		memcpy(net->ipv4.sysctl_tcp_rmem,
3512 		       init_net.ipv4.sysctl_tcp_rmem,
3513 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3514 		memcpy(net->ipv4.sysctl_tcp_wmem,
3515 		       init_net.ipv4.sysctl_tcp_wmem,
3516 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3517 	}
3518 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3519 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3520 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3521 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3522 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3523 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3524 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3525 
3526 	/* Set default values for PLB */
3527 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3528 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3529 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3530 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3531 	/* Default congestion threshold for PLB to mark a round is 50% */
3532 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3533 
3534 	/* Reno is always built in */
3535 	if (!net_eq(net, &init_net) &&
3536 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3537 			       init_net.ipv4.tcp_congestion_control->owner))
3538 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3539 	else
3540 		net->ipv4.tcp_congestion_control = &tcp_reno;
3541 
3542 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3543 	net->ipv4.sysctl_tcp_shrink_window = 0;
3544 
3545 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3546 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3547 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3548 
3549 	return 0;
3550 }
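
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the per-netns defaults
 * assigned in tcp_sk_init() above are exported to userspace under
 * /proc/sys/net/ipv4/ in each network namespace, e.g.
 * sysctl_tcp_syn_retries appears as /proc/sys/net/ipv4/tcp_syn_retries.
 * The snippet below is a hypothetical, plain-libc userspace reader and is
 * never compiled as part of this file.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		/* prints the value seeded from TCP_SYN_RETRIES above */
		printf("tcp_syn_retries = %s", buf);
	fclose(f);
	return 0;
}
#endif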
3551 
3552 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3553 {
3554 	struct net *net;
3555 
3556 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3557 	 * and from the failed setup_net() error-unwinding path are serialized.
3558 	 *
3559 	 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those
3560 	 * on net_exit_list, the thread that dismantles a particular twsk must do
3561 	 * so before any other thread can progress to refcount_dec_and_test() of
3562 	 * tcp_death_row.tw_refcount.
3563 	 */
3564 	mutex_lock(&tcp_exit_batch_mutex);
3565 
3566 	tcp_twsk_purge(net_exit_list);
3567 
3568 	list_for_each_entry(net, net_exit_list, exit_list) {
3569 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3570 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3571 		tcp_fastopen_ctx_destroy(net);
3572 	}
3573 
3574 	mutex_unlock(&tcp_exit_batch_mutex);
3575 }
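
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the tw_refcount protocol
 * checked by the WARN_ON_ONCE() above.  tcp_sk_init() seeds the count at 1
 * for the netns itself; each timewait socket takes an extra reference and
 * drops it when it is destroyed.  After tcp_twsk_purge() has killed every
 * twsk of a dying netns, only the initial reference remains, so the final
 * refcount_dec_and_test() must observe the 1 -> 0 transition.  The names
 * below (demo_obj, demo_obj_*) are hypothetical.
 */
#if 0
struct demo_obj {
	refcount_t users;
};

static void demo_obj_init(struct demo_obj *o)
{
	refcount_set(&o->users, 1);		/* owner's reference */
}

static void demo_obj_hold(struct demo_obj *o)
{
	refcount_inc(&o->users);		/* e.g. one per twsk */
}

static bool demo_obj_put_final(struct demo_obj *o)
{
	/* Returns true only when the last reference is dropped. */
	return refcount_dec_and_test(&o->users);
}
#endif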
3576 
3577 static struct pernet_operations __net_initdata tcp_sk_ops = {
3578        .init	   = tcp_sk_init,
3579        .exit	   = tcp_sk_exit,
3580        .exit_batch = tcp_sk_exit_batch,
3581 };
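
/*
 * Illustrative sketch, not part of tcp_ipv4.c: the contract behind
 * tcp_sk_ops.  register_pernet_subsys() (called from tcp_v4_init() below)
 * runs .init for init_net and for every namespace created afterwards,
 * .exit once per namespace at teardown, and .exit_batch once per batch of
 * dying namespaces.  All identifiers below are hypothetical.
 */
#if 0
static int __net_init demo_net_init(struct net *net)
{
	/* allocate per-netns state and seed defaults */
	return 0;
}

static void __net_exit demo_net_exit(struct net *net)
{
	/* release per-netns state */
}

static struct pernet_operations demo_net_ops = {
	.init = demo_net_init,
	.exit = demo_net_exit,
};

/* registered once at boot or module load:
 *	err = register_pernet_subsys(&demo_net_ops);
 */
#endif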
3582 
3583 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3584 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3585 		     struct sock_common *sk_common, uid_t uid)
3586 
3587 #define INIT_BATCH_SZ 16
3588 
3589 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3590 {
3591 	struct bpf_tcp_iter_state *iter = priv_data;
3592 	int err;
3593 
3594 	err = bpf_iter_init_seq_net(priv_data, aux);
3595 	if (err)
3596 		return err;
3597 
3598 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3599 	if (err) {
3600 		bpf_iter_fini_seq_net(priv_data);
3601 		return err;
3602 	}
3603 
3604 	return 0;
3605 }
3606 
3607 static void bpf_iter_fini_tcp(void *priv_data)
3608 {
3609 	struct bpf_tcp_iter_state *iter = priv_data;
3610 
3611 	bpf_iter_fini_seq_net(priv_data);
3612 	kvfree(iter->batch);
3613 }
3614 
3615 static const struct bpf_iter_seq_info tcp_seq_info = {
3616 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3617 	.init_seq_private	= bpf_iter_init_tcp,
3618 	.fini_seq_private	= bpf_iter_fini_tcp,
3619 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3620 };
3621 
3622 static const struct bpf_func_proto *
3623 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3624 			    const struct bpf_prog *prog)
3625 {
3626 	switch (func_id) {
3627 	case BPF_FUNC_setsockopt:
3628 		return &bpf_sk_setsockopt_proto;
3629 	case BPF_FUNC_getsockopt:
3630 		return &bpf_sk_getsockopt_proto;
3631 	default:
3632 		return NULL;
3633 	}
3634 }
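
/*
 * Illustrative sketch, not part of tcp_ipv4.c: because
 * bpf_iter_tcp_get_func_proto() above exposes bpf_setsockopt() and
 * bpf_getsockopt(), a BPF program attached to the "tcp" iterator can tune
 * every socket it walks.  Modelled loosely on the bpf_iter selftests;
 * assumes vmlinux.h and libbpf's bpf_helpers.h, and is built as a separate
 * BPF object, never as part of this file.
 */
#if 0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define SOL_TCP		6	/* IPPROTO_TCP */
#define TCP_CONGESTION	13	/* from uapi/linux/tcp.h */

SEC("iter/tcp")
int demo_set_cc(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct tcp_sock *tp;
	char cc[] = "cubic";	/* hypothetical target congestion control */

	if (!skc)
		return 0;
	tp = bpf_skc_to_tcp_sock(skc);
	if (!tp)
		return 0;
	/* Permitted here thanks to bpf_sk_setsockopt_proto above. */
	bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
	return 0;
}

char _license[] SEC("license") = "GPL";
#endif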
3635 
3636 static struct bpf_iter_reg tcp_reg_info = {
3637 	.target			= "tcp",
3638 	.ctx_arg_info_size	= 1,
3639 	.ctx_arg_info		= {
3640 		{ offsetof(struct bpf_iter__tcp, sk_common),
3641 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3642 	},
3643 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3644 	.seq_info		= &tcp_seq_info,
3645 };
3646 
3647 static void __init bpf_iter_register(void)
3648 {
3649 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3650 	if (bpf_iter_reg_target(&tcp_reg_info))
3651 		pr_warn("Warning: could not register bpf iterator tcp\n");
3652 }
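
/*
 * Illustrative sketch, not part of tcp_ipv4.c: once bpf_iter_register()
 * above has registered the "tcp" target, userspace can attach an iter/tcp
 * program and read its output as a seq_file-like stream.  Assumes libbpf
 * 1.x; "tcp_iter.bpf.o" and "demo_set_cc" are hypothetical names.
 */
#if 0
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>

int main(void)
{
	struct bpf_object *obj = bpf_object__open_file("tcp_iter.bpf.o", NULL);
	struct bpf_program *prog;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	if (!obj || bpf_object__load(obj))
		return 1;
	prog = bpf_object__find_program_by_name(obj, "demo_set_cc");
	if (!prog)
		return 1;
	link = bpf_program__attach_iter(prog, NULL);
	if (!link)
		return 1;
	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		return 1;
	/* Each read() drives the iterator over more TCP sockets. */
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(iter_fd);
	return 0;
}
#endif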
3653 
3654 #endif
3655 
3656 void __init tcp_v4_init(void)
3657 {
3658 	int cpu, res;
3659 
3660 	for_each_possible_cpu(cpu) {
3661 		struct sock *sk;
3662 
3663 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3664 					   IPPROTO_TCP, &init_net);
3665 		if (res)
3666 			panic("Failed to create the TCP control socket.\n");
3667 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3668 
3669 		/* Enforce IP_DF and IPID==0 for RST and ACK
3670 		 * packets sent in SYN-RECV and TIME-WAIT states.
3671 		 */
3672 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3673 
3674 		sk->sk_clockid = CLOCK_MONOTONIC;
3675 
3676 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3677 	}
3678 	if (register_pernet_subsys(&tcp_sk_ops))
3679 		panic("Failed to register the TCP pernet subsystem.\n");
3680 
3681 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3682 	bpf_iter_register();
3683 #endif
3684 }
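
/*
 * Illustrative sketch, not part of the function above: the per-CPU control
 * sockets installed by tcp_v4_init() are consumed from softirq context
 * elsewhere in this file (the RST/ACK transmit paths) roughly as below -
 * take the per-CPU BH local lock, use this CPU's socket, then release.
 * demo_send_reply() is a hypothetical caller.
 */
#if 0
static void demo_send_reply(struct sk_buff *skb)
{
	struct sock *ctl_sk;

	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	/* ... build and transmit the RST/ACK reply via ctl_sk ... */
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
}
#endif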
3685