xref: /linux/net/ipv4/tcp_ipv4.c (revision 00c94ca2b99e6610e483f92e531b319eeaed94aa)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
40  *					year-long coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 #include <linux/sock_diag.h>
62 
63 #include <net/aligned_data.h>
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/tcp_ecn.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/inet_ecn.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/secure_seq.h>
76 #include <net/busy_poll.h>
77 #include <net/rstreason.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 #include <linux/inetdevice.h>
85 #include <linux/btf_ids.h>
86 #include <linux/skbuff_ref.h>
87 
88 #include <crypto/hash.h>
89 #include <linux/scatterlist.h>
90 
91 #include <trace/events/tcp.h>
92 
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97 
98 struct inet_hashinfo tcp_hashinfo;
99 
100 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
101 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
102 };
103 
104 static DEFINE_MUTEX(tcp_exit_batch_mutex);
105 
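/* The initial sequence number and the timestamp offset of a connection are
 * derived from the addresses and ports found in the packet headers via the
 * secure_seq helpers below, so they are hard to predict yet stable per flow.
 */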
106 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
107 {
108 	return secure_tcp_seq(ip_hdr(skb)->daddr,
109 			      ip_hdr(skb)->saddr,
110 			      tcp_hdr(skb)->dest,
111 			      tcp_hdr(skb)->source);
112 }
113 
114 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
115 {
116 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
117 }
118 
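/* Decide whether an existing TIME-WAIT socket may be reused for a new
 * outgoing connection, as controlled by net.ipv4.tcp_tw_reuse (1 enables
 * reuse, 2 restricts it to loopback traffic) and by the tcp_tw_reuse_delay
 * threshold checked below. Returns 1 and takes a reference on sktw when
 * reuse is allowed, 0 otherwise.
 */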
119 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
120 {
121 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
122 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
123 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
124 	struct tcp_sock *tp = tcp_sk(sk);
125 	int ts_recent_stamp;
126 	u32 reuse_thresh;
127 
128 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
129 		reuse = 0;
130 
131 	if (reuse == 2) {
132 		/* Still does not detect *everything* that goes through
133 		 * lo, since we require a loopback src or dst address
134 		 * or a direct binding to the 'lo' interface.
135 		 */
136 		bool loopback = false;
137 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
138 			loopback = true;
139 #if IS_ENABLED(CONFIG_IPV6)
140 		if (tw->tw_family == AF_INET6) {
141 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
142 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
143 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
144 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
145 				loopback = true;
146 		} else
147 #endif
148 		{
149 			if (ipv4_is_loopback(tw->tw_daddr) ||
150 			    ipv4_is_loopback(tw->tw_rcv_saddr))
151 				loopback = true;
152 		}
153 		if (!loopback)
154 			reuse = 0;
155 	}
156 
157 	/* With PAWS, it is safe from the viewpoint
158 	 * of data integrity. Even without PAWS it is safe provided sequence
159 	 * spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
160 	 *
161 	 * Actually, the idea is close to VJ's: the timestamp cache is held
162 	 * not per host but per port pair, and the TW bucket is used as the
163 	 * state holder.
164 	 *
165 	 * If the TW bucket has already been destroyed we fall back to VJ's
166 	 * scheme and use the initial timestamp retrieved from the peer table.
167 	 */
168 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
169 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
170 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
171 	if (ts_recent_stamp &&
172 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
173 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
174 		 * and releasing the bucket lock.
175 		 */
176 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
177 			return 0;
178 
179 		/* In case of repair and re-using TIME-WAIT sockets we still
180 		 * want to be sure that it is safe as above but honor the
181 		 * sequence numbers and time stamps set as part of the repair
182 		 * process.
183 		 *
184 		 * Without this check re-using a TIME-WAIT socket with TCP
185 		 * repair would accumulate a -1 on the repair assigned
186 		 * sequence number. The first time it is reused the sequence
187 		 * is -1, the second time -2, etc. This fixes that issue
188 		 * without appearing to create any others.
189 		 */
190 		if (likely(!tp->repair)) {
191 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
192 
193 			if (!seq)
194 				seq = 1;
195 			WRITE_ONCE(tp->write_seq, seq);
196 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
197 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
198 		}
199 
200 		return 1;
201 	}
202 
203 	return 0;
204 }
205 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
206 
207 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
208 			      int addr_len)
209 {
210 	/* This check is replicated from tcp_v4_connect() and intended to
211 	 * prevent the BPF program called below from accessing bytes that are
212 	 * outside the bound specified by the user in addr_len.
213 	 */
214 	if (addr_len < sizeof(struct sockaddr_in))
215 		return -EINVAL;
216 
217 	sock_owned_by_me(sk);
218 
219 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
220 }
221 
222 /* This will initiate an outgoing connection. */
223 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
224 {
225 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
226 	struct inet_timewait_death_row *tcp_death_row;
227 	struct inet_sock *inet = inet_sk(sk);
228 	struct tcp_sock *tp = tcp_sk(sk);
229 	struct ip_options_rcu *inet_opt;
230 	struct net *net = sock_net(sk);
231 	__be16 orig_sport, orig_dport;
232 	__be32 daddr, nexthop;
233 	struct flowi4 *fl4;
234 	struct rtable *rt;
235 	int err;
236 
237 	if (addr_len < sizeof(struct sockaddr_in))
238 		return -EINVAL;
239 
240 	if (usin->sin_family != AF_INET)
241 		return -EAFNOSUPPORT;
242 
243 	nexthop = daddr = usin->sin_addr.s_addr;
244 	inet_opt = rcu_dereference_protected(inet->inet_opt,
245 					     lockdep_sock_is_held(sk));
246 	if (inet_opt && inet_opt->opt.srr) {
247 		if (!daddr)
248 			return -EINVAL;
249 		nexthop = inet_opt->opt.faddr;
250 	}
251 
252 	orig_sport = inet->inet_sport;
253 	orig_dport = usin->sin_port;
254 	fl4 = &inet->cork.fl.u.ip4;
255 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
256 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
257 			      orig_dport, sk);
258 	if (IS_ERR(rt)) {
259 		err = PTR_ERR(rt);
260 		if (err == -ENETUNREACH)
261 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
262 		return err;
263 	}
264 
265 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
266 		ip_rt_put(rt);
267 		return -ENETUNREACH;
268 	}
269 
270 	if (!inet_opt || !inet_opt->opt.srr)
271 		daddr = fl4->daddr;
272 
273 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
274 
275 	if (!inet->inet_saddr) {
276 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
277 		if (err) {
278 			ip_rt_put(rt);
279 			return err;
280 		}
281 	} else {
282 		sk_rcv_saddr_set(sk, inet->inet_saddr);
283 	}
284 
285 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
286 		/* Reset inherited state */
287 		tp->rx_opt.ts_recent	   = 0;
288 		tp->rx_opt.ts_recent_stamp = 0;
289 		if (likely(!tp->repair))
290 			WRITE_ONCE(tp->write_seq, 0);
291 	}
292 
293 	inet->inet_dport = usin->sin_port;
294 	sk_daddr_set(sk, daddr);
295 
296 	inet_csk(sk)->icsk_ext_hdr_len = 0;
297 	if (inet_opt)
298 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
299 
300 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
301 
302 	/* Socket identity is still unknown (sport may be zero).
303 	 * However we set state to SYN-SENT and, without releasing the socket
304 	 * lock, select a source port, enter ourselves into the hash tables
305 	 * and complete initialization after this.
306 	 */
307 	tcp_set_state(sk, TCP_SYN_SENT);
308 	err = inet_hash_connect(tcp_death_row, sk);
309 	if (err)
310 		goto failure;
311 
312 	sk_set_txhash(sk);
313 
314 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
315 			       inet->inet_sport, inet->inet_dport, sk);
316 	if (IS_ERR(rt)) {
317 		err = PTR_ERR(rt);
318 		rt = NULL;
319 		goto failure;
320 	}
321 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
322 	/* OK, now commit destination to socket.  */
323 	sk->sk_gso_type = SKB_GSO_TCPV4;
324 	sk_setup_caps(sk, &rt->dst);
325 	rt = NULL;
326 
327 	if (likely(!tp->repair)) {
328 		if (!tp->write_seq)
329 			WRITE_ONCE(tp->write_seq,
330 				   secure_tcp_seq(inet->inet_saddr,
331 						  inet->inet_daddr,
332 						  inet->inet_sport,
333 						  usin->sin_port));
334 		WRITE_ONCE(tp->tsoffset,
335 			   secure_tcp_ts_off(net, inet->inet_saddr,
336 					     inet->inet_daddr));
337 	}
338 
339 	atomic_set(&inet->inet_id, get_random_u16());
340 
341 	if (tcp_fastopen_defer_connect(sk, &err))
342 		return err;
343 	if (err)
344 		goto failure;
345 
346 	err = tcp_connect(sk);
347 
348 	if (err)
349 		goto failure;
350 
351 	return 0;
352 
353 failure:
354 	/*
355 	 * This unhashes the socket and releases the local port,
356 	 * if necessary.
357 	 */
358 	tcp_set_state(sk, TCP_CLOSE);
359 	inet_bhash2_reset_saddr(sk);
360 	ip_rt_put(rt);
361 	sk->sk_route_caps = 0;
362 	inet->inet_dport = 0;
363 	return err;
364 }
365 EXPORT_IPV6_MOD(tcp_v4_connect);
366 
367 /*
368  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
369  * It can be called through tcp_release_cb() if socket was owned by user
370  * at the time tcp_v4_err() was called to handle ICMP message.
371  */
372 void tcp_v4_mtu_reduced(struct sock *sk)
373 {
374 	struct inet_sock *inet = inet_sk(sk);
375 	struct dst_entry *dst;
376 	u32 mtu;
377 
378 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
379 		return;
380 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
381 	dst = inet_csk_update_pmtu(sk, mtu);
382 	if (!dst)
383 		return;
384 
385 	/* Something is about to go wrong... Remember the soft error
386 	 * in case this connection is not able to recover.
387 	 */
388 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
389 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
390 
391 	mtu = dst_mtu(dst);
392 
393 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
394 	    ip_sk_accept_pmtu(sk) &&
395 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
396 		tcp_sync_mss(sk, mtu);
397 
398 		/* Resend the TCP packet because it's
399 		 * clear that the old packet has been
400 		 * dropped. This is the new "fast" path mtu
401 		 * discovery.
402 		 */
403 		tcp_simple_retransmit(sk);
404 	} /* else let the usual retransmit timer handle it */
405 }
406 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
407 
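/* Apply an ICMP redirect to the socket's cached route, if one is still attached. */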
408 static void do_redirect(struct sk_buff *skb, struct sock *sk)
409 {
410 	struct dst_entry *dst = __sk_dst_check(sk, 0);
411 
412 	if (dst)
413 		dst->ops->redirect(dst, sk, skb);
414 }
415 
416 
417 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
418 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
419 {
420 	struct request_sock *req = inet_reqsk(sk);
421 	struct net *net = sock_net(sk);
422 
423 	/* ICMPs are not backlogged, hence we cannot get
424 	 * an established socket here.
425 	 */
426 	if (seq != tcp_rsk(req)->snt_isn) {
427 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
428 	} else if (abort) {
429 		/*
430 		 * Still in SYN_RECV, just remove it silently.
431 		 * There is no good way to pass the error to the newly
432 		 * created socket, and POSIX does not want network
433 		 * errors returned from accept().
434 		 */
435 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
436 		tcp_listendrop(req->rsk_listener);
437 	}
438 	reqsk_put(req);
439 }
440 EXPORT_IPV6_MOD(tcp_req_err);
441 
442 /* TCP-LD (RFC 6069) logic */
443 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
444 {
445 	struct inet_connection_sock *icsk = inet_csk(sk);
446 	struct tcp_sock *tp = tcp_sk(sk);
447 	struct sk_buff *skb;
448 	s32 remaining;
449 	u32 delta_us;
450 
451 	if (sock_owned_by_user(sk))
452 		return;
453 
454 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
455 	    !icsk->icsk_backoff)
456 		return;
457 
458 	skb = tcp_rtx_queue_head(sk);
459 	if (WARN_ON_ONCE(!skb))
460 		return;
461 
462 	icsk->icsk_backoff--;
463 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
464 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
465 
466 	tcp_mstamp_refresh(tp);
467 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
468 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
469 
470 	if (remaining > 0) {
471 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
472 	} else {
473 		/* RTO revert clocked out retransmission.
474 		 * Will retransmit now.
475 		 */
476 		tcp_retransmit_timer(sk);
477 	}
478 }
479 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
480 
481 /*
482  * This routine is called by the ICMP module when it gets some
483  * sort of error condition.  If err < 0 then the socket should
484  * be closed and the error returned to the user.  If err > 0
485  * it's just the icmp type << 8 | icmp code.  After adjustment, the
486  * header points to the first 8 bytes of the tcp header.  We need
487  * to find the appropriate port.
488  *
489  * The locking strategy used here is very "optimistic". When
490  * someone else accesses the socket the ICMP is just dropped
491  * and for some paths there is no check at all.
492  * A more general error queue to queue errors for later handling
493  * is probably better.
494  *
495  */
496 
497 int tcp_v4_err(struct sk_buff *skb, u32 info)
498 {
499 	const struct iphdr *iph = (const struct iphdr *)skb->data;
500 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
501 	struct net *net = dev_net_rcu(skb->dev);
502 	const int type = icmp_hdr(skb)->type;
503 	const int code = icmp_hdr(skb)->code;
504 	struct request_sock *fastopen;
505 	struct tcp_sock *tp;
506 	u32 seq, snd_una;
507 	struct sock *sk;
508 	int err;
509 
510 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
511 				       ntohs(th->source), inet_iif(skb), 0);
512 	if (!sk) {
513 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
514 		return -ENOENT;
515 	}
516 	if (sk->sk_state == TCP_TIME_WAIT) {
517 		/* To increase the counter of ignored icmps for TCP-AO */
518 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
519 		inet_twsk_put(inet_twsk(sk));
520 		return 0;
521 	}
522 	seq = ntohl(th->seq);
523 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
524 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
525 				     type == ICMP_TIME_EXCEEDED ||
526 				     (type == ICMP_DEST_UNREACH &&
527 				      (code == ICMP_NET_UNREACH ||
528 				       code == ICMP_HOST_UNREACH)));
529 		return 0;
530 	}
531 
532 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
533 		sock_put(sk);
534 		return 0;
535 	}
536 
537 	bh_lock_sock(sk);
538 	/* If too many ICMPs get dropped on busy
539 	 * servers this needs to be solved differently.
540 	 * We do take care of the PMTU discovery (RFC 1191) special case:
541 	 * we can receive locally generated ICMP messages while the socket is held.
542 	 */
543 	if (sock_owned_by_user(sk)) {
544 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
545 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
546 	}
547 	if (sk->sk_state == TCP_CLOSE)
548 		goto out;
549 
550 	if (static_branch_unlikely(&ip4_min_ttl)) {
551 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
552 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
553 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
554 			goto out;
555 		}
556 	}
557 
558 	tp = tcp_sk(sk);
559 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
560 	fastopen = rcu_dereference(tp->fastopen_rsk);
561 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
562 	if (sk->sk_state != TCP_LISTEN &&
563 	    !between(seq, snd_una, tp->snd_nxt)) {
564 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
565 		goto out;
566 	}
567 
568 	switch (type) {
569 	case ICMP_REDIRECT:
570 		if (!sock_owned_by_user(sk))
571 			do_redirect(skb, sk);
572 		goto out;
573 	case ICMP_SOURCE_QUENCH:
574 		/* Just silently ignore these. */
575 		goto out;
576 	case ICMP_PARAMETERPROB:
577 		err = EPROTO;
578 		break;
579 	case ICMP_DEST_UNREACH:
580 		if (code > NR_ICMP_UNREACH)
581 			goto out;
582 
583 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
584 			/* We are not interested in TCP_LISTEN and open_requests
585 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
586 			 * they should go through unfragmented).
587 			 */
588 			if (sk->sk_state == TCP_LISTEN)
589 				goto out;
590 
591 			WRITE_ONCE(tp->mtu_info, info);
592 			if (!sock_owned_by_user(sk)) {
593 				tcp_v4_mtu_reduced(sk);
594 			} else {
595 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
596 					sock_hold(sk);
597 			}
598 			goto out;
599 		}
600 
601 		err = icmp_err_convert[code].errno;
602 		/* check if this ICMP message allows revert of backoff.
603 		 * (see RFC 6069)
604 		 */
605 		if (!fastopen &&
606 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
607 			tcp_ld_RTO_revert(sk, seq);
608 		break;
609 	case ICMP_TIME_EXCEEDED:
610 		err = EHOSTUNREACH;
611 		break;
612 	default:
613 		goto out;
614 	}
615 
616 	switch (sk->sk_state) {
617 	case TCP_SYN_SENT:
618 	case TCP_SYN_RECV:
619 		/* Only in fast or simultaneous open. If a fast open socket is
620 		 * already accepted it is treated as a connected one below.
621 		 */
622 		if (fastopen && !fastopen->sk)
623 			break;
624 
625 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
626 
627 		if (!sock_owned_by_user(sk))
628 			tcp_done_with_error(sk, err);
629 		else
630 			WRITE_ONCE(sk->sk_err_soft, err);
631 		goto out;
632 	}
633 
634 	/* If we've already connected we will keep trying
635 	 * until we time out, or the user gives up.
636 	 *
637 	 * RFC 1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
638 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
639 	 * but it is obsoleted by pmtu discovery).
640 	 *
641 	 * Note that in the modern internet, where routing is unreliable
642 	 * and broken firewalls sit in every dark corner sending random
643 	 * errors as ordered by their masters, even these two messages
644 	 * finally lose their original sense (even Linux sends invalid PORT_UNREACHs).
645 	 *
646 	 * Now we are in compliance with RFCs.
647 	 *							--ANK (980905)
648 	 */
649 
650 	if (!sock_owned_by_user(sk) &&
651 	    inet_test_bit(RECVERR, sk)) {
652 		WRITE_ONCE(sk->sk_err, err);
653 		sk_error_report(sk);
654 	} else	{ /* Only an error on timeout */
655 		WRITE_ONCE(sk->sk_err_soft, err);
656 	}
657 
658 out:
659 	bh_unlock_sock(sk);
660 	sock_put(sk);
661 	return 0;
662 }
663 
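/* Prepare the TCP checksum for offloaded completion: store the
 * pseudo-header checksum in th->check and record where the full checksum
 * must later be written into the packet.
 */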
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 	struct tcphdr *th = tcp_hdr(skb);
667 
668 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 	skb->csum_start = skb_transport_header(skb) - skb->head;
670 	skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672 
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 	const struct inet_sock *inet = inet_sk(sk);
677 
678 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_IPV6_MOD(tcp_v4_send_check);
681 
682 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683 
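/* Build and sign the TCP-AO option for an outgoing RST. Returns true if
 * the reset must be dropped (no usable key or hashing failed), false once
 * the option block and the TCP header length have been filled in.
 */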
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 				 const struct tcp_ao_hdr *aoh,
686 				 struct ip_reply_arg *arg, struct tcphdr *reply,
687 				 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 	int sdif = tcp_v4_sdif(skb);
691 	int dif = inet_iif(skb);
692 	int l3index = sdif ? dif : 0;
693 	bool allocated_traffic_key;
694 	struct tcp_ao_key *key;
695 	char *traffic_key;
696 	bool drop = true;
697 	u32 ao_sne = 0;
698 	u8 keyid;
699 
700 	rcu_read_lock();
701 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 				 &key, &traffic_key, &allocated_traffic_key,
703 				 &keyid, &ao_sne))
704 		goto out;
705 
706 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 				 (aoh->rnext_keyid << 8) | keyid);
708 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 	reply->doff = arg->iov[0].iov_len / 4;
710 
711 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 			    key, traffic_key,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 			    reply, ao_sne))
716 		goto out;
717 	drop = false;
718 out:
719 	rcu_read_unlock();
720 	if (allocated_traffic_key)
721 		kfree(traffic_key);
722 	return drop;
723 #else
724 	return true;
725 #endif
726 }
727 
728 /*
729  *	This routine will send an RST to the other tcp.
730  *
731  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
732  *		      for the reset?
733  *	Answer: if a packet caused an RST, it is not for a socket
734  *		existing in our system; if it is matched to a socket,
735  *		it is just a duplicate segment or a bug in the other side's TCP.
736  *		So we build the reply based only on the parameters that
737  *		arrived with the segment.
738  *	Exception: precedence violation. We do not implement it in any case.
739  */
740 
741 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
742 			      enum sk_rst_reason reason)
743 {
744 	const struct tcphdr *th = tcp_hdr(skb);
745 	struct {
746 		struct tcphdr th;
747 		__be32 opt[REPLY_OPTIONS_LEN];
748 	} rep;
749 	const __u8 *md5_hash_location = NULL;
750 	const struct tcp_ao_hdr *aoh;
751 	struct ip_reply_arg arg;
752 #ifdef CONFIG_TCP_MD5SIG
753 	struct tcp_md5sig_key *key = NULL;
754 	unsigned char newhash[16];
755 	struct sock *sk1 = NULL;
756 	int genhash;
757 #endif
758 	u64 transmit_time = 0;
759 	struct sock *ctl_sk;
760 	struct net *net;
761 	u32 txhash = 0;
762 
763 	/* Never send a reset in response to a reset. */
764 	if (th->rst)
765 		return;
766 
767 	/* If sk is not NULL, we did a successful lookup and the incoming
768 	 * route had to be correct. prequeue might have dropped our dst.
769 	 */
770 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
771 		return;
772 
773 	/* Swap the send and the receive. */
774 	memset(&rep, 0, sizeof(rep));
775 	rep.th.dest   = th->source;
776 	rep.th.source = th->dest;
777 	rep.th.doff   = sizeof(struct tcphdr) / 4;
778 	rep.th.rst    = 1;
779 
780 	if (th->ack) {
781 		rep.th.seq = th->ack_seq;
782 	} else {
783 		rep.th.ack = 1;
784 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
785 				       skb->len - (th->doff << 2));
786 	}
787 
788 	memset(&arg, 0, sizeof(arg));
789 	arg.iov[0].iov_base = (unsigned char *)&rep;
790 	arg.iov[0].iov_len  = sizeof(rep.th);
791 
792 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
793 
794 	/* Invalid TCP option size or twice included auth */
795 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
796 		return;
797 
798 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
799 		return;
800 
801 #ifdef CONFIG_TCP_MD5SIG
802 	rcu_read_lock();
803 	if (sk && sk_fullsock(sk)) {
804 		const union tcp_md5_addr *addr;
805 		int l3index;
806 
807 		/* If sdif is set, the packet ingressed via a device
808 		 * in an L3 domain and inet_iif is set to it.
809 		 */
810 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
811 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
812 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
813 	} else if (md5_hash_location) {
814 		const union tcp_md5_addr *addr;
815 		int sdif = tcp_v4_sdif(skb);
816 		int dif = inet_iif(skb);
817 		int l3index;
818 
819 		/*
820 		 * The active side is lost. Try to find the listening socket via
821 		 * the source port, then find the md5 key via that socket.
822 		 * We do not loosen security here:
823 		 * the incoming packet is checked against the md5 hash of the
824 		 * found key, and no RST is generated if the hash doesn't match.
825 		 */
826 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
827 					     th->source, ip_hdr(skb)->daddr,
828 					     ntohs(th->source), dif, sdif);
829 		/* don't send an RST if we can't find a key */
830 		if (!sk1)
831 			goto out;
832 
833 		/* If sdif is set, the packet ingressed via a device
834 		 * in an L3 domain and dif is set to it.
835 		 */
836 		l3index = sdif ? dif : 0;
837 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
838 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
839 		if (!key)
840 			goto out;
841 
842 
843 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
844 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
845 			goto out;
846 
847 	}
848 
849 	if (key) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
851 				   (TCPOPT_NOP << 16) |
852 				   (TCPOPT_MD5SIG << 8) |
853 				   TCPOLEN_MD5SIG);
854 		/* Update length and the length the header thinks exists */
855 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 		rep.th.doff = arg.iov[0].iov_len / 4;
857 
858 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
859 				     key, ip_hdr(skb)->saddr,
860 				     ip_hdr(skb)->daddr, &rep.th);
861 	}
862 #endif
863 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
864 	if (rep.opt[0] == 0) {
865 		__be32 mrst = mptcp_reset_option(skb);
866 
867 		if (mrst) {
868 			rep.opt[0] = mrst;
869 			arg.iov[0].iov_len += sizeof(mrst);
870 			rep.th.doff = arg.iov[0].iov_len / 4;
871 		}
872 	}
873 
874 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
875 				      ip_hdr(skb)->saddr, /* XXX */
876 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
877 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
878 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
879 
880 	/* When the socket is gone, all binding information is lost and
881 	 * routing might fail in this case. No choice here: if we choose to force
882 	 * the input interface, we will misroute in case of an asymmetric route.
883 	 */
884 	if (sk)
885 		arg.bound_dev_if = sk->sk_bound_dev_if;
886 
887 	trace_tcp_send_reset(sk, skb, reason);
888 
889 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
890 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
891 
892 	/* ECN bits of TW reset are cleared */
893 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
894 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
895 	local_bh_disable();
896 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
897 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
898 
899 	sock_net_set(ctl_sk, net);
900 	if (sk) {
901 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
902 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
903 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
904 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
905 		transmit_time = tcp_transmit_time(sk);
906 		xfrm_sk_clone_policy(ctl_sk, sk);
907 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
908 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
909 	} else {
910 		ctl_sk->sk_mark = 0;
911 		ctl_sk->sk_priority = 0;
912 	}
913 	ip_send_unicast_reply(ctl_sk, sk,
914 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
915 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
916 			      &arg, arg.iov[0].iov_len,
917 			      transmit_time, txhash);
918 
919 	xfrm_sk_free_policy(ctl_sk);
920 	sock_net_set(ctl_sk, &init_net);
921 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
923 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
924 	local_bh_enable();
925 
926 #ifdef CONFIG_TCP_MD5SIG
927 out:
928 	rcu_read_unlock();
929 #endif
930 }
931 
932 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
933  * outside of socket context, is certainly ugly. What can I do?
934  */
935 
936 static void tcp_v4_send_ack(const struct sock *sk,
937 			    struct sk_buff *skb, u32 seq, u32 ack,
938 			    u32 win, u32 tsval, u32 tsecr, int oif,
939 			    struct tcp_key *key,
940 			    int reply_flags, u8 tos, u32 txhash)
941 {
942 	const struct tcphdr *th = tcp_hdr(skb);
943 	struct {
944 		struct tcphdr th;
945 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
946 	} rep;
947 	struct net *net = sock_net(sk);
948 	struct ip_reply_arg arg;
949 	struct sock *ctl_sk;
950 	u64 transmit_time;
951 
952 	memset(&rep.th, 0, sizeof(struct tcphdr));
953 	memset(&arg, 0, sizeof(arg));
954 
955 	arg.iov[0].iov_base = (unsigned char *)&rep;
956 	arg.iov[0].iov_len  = sizeof(rep.th);
957 	if (tsecr) {
958 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
959 				   (TCPOPT_TIMESTAMP << 8) |
960 				   TCPOLEN_TIMESTAMP);
961 		rep.opt[1] = htonl(tsval);
962 		rep.opt[2] = htonl(tsecr);
963 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
964 	}
965 
966 	/* Swap the send and the receive. */
967 	rep.th.dest    = th->source;
968 	rep.th.source  = th->dest;
969 	rep.th.doff    = arg.iov[0].iov_len / 4;
970 	rep.th.seq     = htonl(seq);
971 	rep.th.ack_seq = htonl(ack);
972 	rep.th.ack     = 1;
973 	rep.th.window  = htons(win);
974 
975 #ifdef CONFIG_TCP_MD5SIG
976 	if (tcp_key_is_md5(key)) {
977 		int offset = (tsecr) ? 3 : 0;
978 
979 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
980 					  (TCPOPT_NOP << 16) |
981 					  (TCPOPT_MD5SIG << 8) |
982 					  TCPOLEN_MD5SIG);
983 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
984 		rep.th.doff = arg.iov[0].iov_len/4;
985 
986 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
987 				    key->md5_key, ip_hdr(skb)->saddr,
988 				    ip_hdr(skb)->daddr, &rep.th);
989 	}
990 #endif
991 #ifdef CONFIG_TCP_AO
992 	if (tcp_key_is_ao(key)) {
993 		int offset = (tsecr) ? 3 : 0;
994 
995 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
996 					  (tcp_ao_len(key->ao_key) << 16) |
997 					  (key->ao_key->sndid << 8) |
998 					  key->rcv_next);
999 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1000 		rep.th.doff = arg.iov[0].iov_len / 4;
1001 
1002 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1003 				key->ao_key, key->traffic_key,
1004 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1005 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1006 				&rep.th, key->sne);
1007 	}
1008 #endif
1009 	arg.flags = reply_flags;
1010 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1011 				      ip_hdr(skb)->saddr, /* XXX */
1012 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1013 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1014 	if (oif)
1015 		arg.bound_dev_if = oif;
1016 	arg.tos = tos;
1017 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1018 	local_bh_disable();
1019 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1020 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1021 	sock_net_set(ctl_sk, net);
1022 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1023 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1024 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1025 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1026 	transmit_time = tcp_transmit_time(sk);
1027 	ip_send_unicast_reply(ctl_sk, sk,
1028 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1029 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1030 			      &arg, arg.iov[0].iov_len,
1031 			      transmit_time, txhash);
1032 
1033 	sock_net_set(ctl_sk, &init_net);
1034 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1035 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1036 	local_bh_enable();
1037 }
1038 
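/* ACK a segment received by a TIME-WAIT socket, signing the reply with
 * TCP-AO or MD5 when the old connection used them, then drop the
 * timewait reference.
 */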
1039 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1040 				enum tcp_tw_status tw_status)
1041 {
1042 	struct inet_timewait_sock *tw = inet_twsk(sk);
1043 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1044 	struct tcp_key key = {};
1045 	u8 tos = tw->tw_tos;
1046 
1047 	/* Clean only the ECN bits of TW ACKs for out-of-window data or
1048 	 * paws_reject, while leaving the ECN bits of other TW ACKs alone, so
1049 	 * those ACKs are not placed in a different service queue (Classic rather than L4S).
1050 	 */
1051 	if (tw_status == TCP_TW_ACK_OOW)
1052 		tos &= ~INET_ECN_MASK;
1053 
1054 #ifdef CONFIG_TCP_AO
1055 	struct tcp_ao_info *ao_info;
1056 
1057 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1058 		/* FIXME: the segment to-be-acked is not verified yet */
1059 		ao_info = rcu_dereference(tcptw->ao_info);
1060 		if (ao_info) {
1061 			const struct tcp_ao_hdr *aoh;
1062 
1063 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1064 				inet_twsk_put(tw);
1065 				return;
1066 			}
1067 
1068 			if (aoh)
1069 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1070 								    aoh->rnext_keyid, -1);
1071 		}
1072 	}
1073 	if (key.ao_key) {
1074 		struct tcp_ao_key *rnext_key;
1075 
1076 		key.traffic_key = snd_other_key(key.ao_key);
1077 		key.sne = READ_ONCE(ao_info->snd_sne);
1078 		rnext_key = READ_ONCE(ao_info->rnext_key);
1079 		key.rcv_next = rnext_key->rcvid;
1080 		key.type = TCP_KEY_AO;
1081 #else
1082 	if (0) {
1083 #endif
1084 	} else if (static_branch_tcp_md5()) {
1085 		key.md5_key = tcp_twsk_md5_key(tcptw);
1086 		if (key.md5_key)
1087 			key.type = TCP_KEY_MD5;
1088 	}
1089 
1090 	tcp_v4_send_ack(sk, skb,
1091 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1092 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1093 			tcp_tw_tsval(tcptw),
1094 			READ_ONCE(tcptw->tw_ts_recent),
1095 			tw->tw_bound_dev_if, &key,
1096 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1097 			tos,
1098 			tw->tw_txhash);
1099 
1100 	inet_twsk_put(tw);
1101 }
1102 
1103 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1104 				  struct request_sock *req)
1105 {
1106 	struct tcp_key key = {};
1107 
1108 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1109 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1110 	 */
1111 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1112 					     tcp_sk(sk)->snd_nxt;
1113 
1114 #ifdef CONFIG_TCP_AO
1115 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1116 	    tcp_rsk_used_ao(req)) {
1117 		const union tcp_md5_addr *addr;
1118 		const struct tcp_ao_hdr *aoh;
1119 		int l3index;
1120 
1121 		/* Invalid TCP option size or twice included auth */
1122 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1123 			return;
1124 		if (!aoh)
1125 			return;
1126 
1127 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1128 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1129 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1130 					      aoh->rnext_keyid, -1);
1131 		if (unlikely(!key.ao_key)) {
1132 			/* Send ACK with any matching MKT for the peer */
1133 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1134 			/* The matching key disappeared (user removed the key?),
1135 			 * so let the handshake time out.
1136 			 */
1137 			if (!key.ao_key) {
1138 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1139 						     addr,
1140 						     ntohs(tcp_hdr(skb)->source),
1141 						     &ip_hdr(skb)->daddr,
1142 						     ntohs(tcp_hdr(skb)->dest));
1143 				return;
1144 			}
1145 		}
1146 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1147 		if (!key.traffic_key)
1148 			return;
1149 
1150 		key.type = TCP_KEY_AO;
1151 		key.rcv_next = aoh->keyid;
1152 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1153 #else
1154 	if (0) {
1155 #endif
1156 	} else if (static_branch_tcp_md5()) {
1157 		const union tcp_md5_addr *addr;
1158 		int l3index;
1159 
1160 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1161 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1162 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1163 		if (key.md5_key)
1164 			key.type = TCP_KEY_MD5;
1165 	}
1166 
1167 	/* As with TW ACKs for oow data or paws_reject, the ECN bits are cleared */
1168 	tcp_v4_send_ack(sk, skb, seq,
1169 			tcp_rsk(req)->rcv_nxt,
1170 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1171 			tcp_rsk_tsval(tcp_rsk(req)),
1172 			req->ts_recent,
1173 			0, &key,
1174 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1175 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1176 			READ_ONCE(tcp_rsk(req)->txhash));
1177 	if (tcp_key_is_ao(&key))
1178 		kfree(key.traffic_key);
1179 }
1180 
1181 /*
1182  *	Send a SYN-ACK after having received a SYN.
1183  *	This still operates on a request_sock only, not on a big
1184  *	socket.
1185  */
1186 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1187 			      struct flowi *fl,
1188 			      struct request_sock *req,
1189 			      struct tcp_fastopen_cookie *foc,
1190 			      enum tcp_synack_type synack_type,
1191 			      struct sk_buff *syn_skb)
1192 {
1193 	struct inet_request_sock *ireq = inet_rsk(req);
1194 	struct flowi4 fl4;
1195 	int err = -1;
1196 	struct sk_buff *skb;
1197 	u8 tos;
1198 
1199 	/* First, grab a route. */
1200 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1201 		return -1;
1202 
1203 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1204 
1205 	if (skb) {
1206 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1207 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1208 
1209 		tos = READ_ONCE(inet_sk(sk)->tos);
1210 
1211 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1212 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1213 			      (tos & INET_ECN_MASK);
1214 
1215 		if (!INET_ECN_is_capable(tos) &&
1216 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1217 			tos |= INET_ECN_ECT_0;
1218 
1219 		rcu_read_lock();
1220 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1221 					    ireq->ir_rmt_addr,
1222 					    rcu_dereference(ireq->ireq_opt),
1223 					    tos);
1224 		rcu_read_unlock();
1225 		err = net_xmit_eval(err);
1226 	}
1227 
1228 	return err;
1229 }
1230 
1231 /*
1232  *	IPv4 request_sock destructor.
1233  */
1234 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1235 {
1236 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1237 }
1238 
1239 #ifdef CONFIG_TCP_MD5SIG
1240 /*
1241  * RFC2385 MD5 checksumming requires a mapping of
1242  * IP address->MD5 Key.
1243  * We need to maintain these in the sk structure.
1244  */
1245 
1246 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1247 EXPORT_IPV6_MOD(tcp_md5_needed);
1248 
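/* Prefer the more specific key: one bound to an L3 domain beats an
 * unbound one, and a longer address prefix beats a shorter one.
 */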
1249 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1250 {
1251 	if (!old)
1252 		return true;
1253 
1254 	/* l3index always overrides non-l3index */
1255 	if (old->l3index && new->l3index == 0)
1256 		return false;
1257 	if (old->l3index == 0 && new->l3index)
1258 		return true;
1259 
1260 	return old->prefixlen < new->prefixlen;
1261 }
1262 
1263 /* Find the Key structure for an address.  */
1264 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1265 					   const union tcp_md5_addr *addr,
1266 					   int family, bool any_l3index)
1267 {
1268 	const struct tcp_sock *tp = tcp_sk(sk);
1269 	struct tcp_md5sig_key *key;
1270 	const struct tcp_md5sig_info *md5sig;
1271 	__be32 mask;
1272 	struct tcp_md5sig_key *best_match = NULL;
1273 	bool match;
1274 
1275 	/* caller either holds rcu_read_lock() or socket lock */
1276 	md5sig = rcu_dereference_check(tp->md5sig_info,
1277 				       lockdep_sock_is_held(sk));
1278 	if (!md5sig)
1279 		return NULL;
1280 
1281 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1282 				 lockdep_sock_is_held(sk)) {
1283 		if (key->family != family)
1284 			continue;
1285 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1286 		    key->l3index != l3index)
1287 			continue;
1288 		if (family == AF_INET) {
1289 			mask = inet_make_mask(key->prefixlen);
1290 			match = (key->addr.a4.s_addr & mask) ==
1291 				(addr->a4.s_addr & mask);
1292 #if IS_ENABLED(CONFIG_IPV6)
1293 		} else if (family == AF_INET6) {
1294 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1295 						  key->prefixlen);
1296 #endif
1297 		} else {
1298 			match = false;
1299 		}
1300 
1301 		if (match && better_md5_match(best_match, key))
1302 			best_match = key;
1303 	}
1304 	return best_match;
1305 }
1306 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1307 
1308 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1309 						      const union tcp_md5_addr *addr,
1310 						      int family, u8 prefixlen,
1311 						      int l3index, u8 flags)
1312 {
1313 	const struct tcp_sock *tp = tcp_sk(sk);
1314 	struct tcp_md5sig_key *key;
1315 	unsigned int size = sizeof(struct in_addr);
1316 	const struct tcp_md5sig_info *md5sig;
1317 
1318 	/* caller either holds rcu_read_lock() or socket lock */
1319 	md5sig = rcu_dereference_check(tp->md5sig_info,
1320 				       lockdep_sock_is_held(sk));
1321 	if (!md5sig)
1322 		return NULL;
1323 #if IS_ENABLED(CONFIG_IPV6)
1324 	if (family == AF_INET6)
1325 		size = sizeof(struct in6_addr);
1326 #endif
1327 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1328 				 lockdep_sock_is_held(sk)) {
1329 		if (key->family != family)
1330 			continue;
1331 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1332 			continue;
1333 		if (key->l3index != l3index)
1334 			continue;
1335 		if (!memcmp(&key->addr, addr, size) &&
1336 		    key->prefixlen == prefixlen)
1337 			return key;
1338 	}
1339 	return NULL;
1340 }
1341 
1342 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1343 					 const struct sock *addr_sk)
1344 {
1345 	const union tcp_md5_addr *addr;
1346 	int l3index;
1347 
1348 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1349 						 addr_sk->sk_bound_dev_if);
1350 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1351 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1352 }
1353 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1354 
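/* Allocate the per-socket MD5 key list on first use; GSO is disabled
 * here since segmentation offload does not mix with per-segment MD5
 * signing.
 */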
1355 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1356 {
1357 	struct tcp_sock *tp = tcp_sk(sk);
1358 	struct tcp_md5sig_info *md5sig;
1359 
1360 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1361 	if (!md5sig)
1362 		return -ENOMEM;
1363 
1364 	sk_gso_disable(sk);
1365 	INIT_HLIST_HEAD(&md5sig->head);
1366 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1367 	return 0;
1368 }
1369 
1370 /* This can be called on a newly created socket, from other files */
1371 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1372 			    int family, u8 prefixlen, int l3index, u8 flags,
1373 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1374 {
1375 	/* Add Key to the list */
1376 	struct tcp_md5sig_key *key;
1377 	struct tcp_sock *tp = tcp_sk(sk);
1378 	struct tcp_md5sig_info *md5sig;
1379 
1380 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1381 	if (key) {
1382 		/* Pre-existing entry - just update that one.
1383 		 * Note that the key might be used concurrently.
1384 		 * data_race() tells KCSAN that we do not care about
1385 		 * key mismatches, since changing the MD5 key on live flows
1386 		 * can lead to packet drops.
1387 		 */
1388 		data_race(memcpy(key->key, newkey, newkeylen));
1389 
1390 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1391 		 * Also note that a reader could catch the new key->keylen value
1392 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1393 		 * at sock_kmalloc() time below these lines.
1394 		 */
1395 		WRITE_ONCE(key->keylen, newkeylen);
1396 
1397 		return 0;
1398 	}
1399 
1400 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1401 					   lockdep_sock_is_held(sk));
1402 
1403 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1404 	if (!key)
1405 		return -ENOMEM;
1406 
1407 	memcpy(key->key, newkey, newkeylen);
1408 	key->keylen = newkeylen;
1409 	key->family = family;
1410 	key->prefixlen = prefixlen;
1411 	key->l3index = l3index;
1412 	key->flags = flags;
1413 	memcpy(&key->addr, addr,
1414 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1415 								 sizeof(struct in_addr));
1416 	hlist_add_head_rcu(&key->node, &md5sig->head);
1417 	return 0;
1418 }
1419 
1420 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1421 		   int family, u8 prefixlen, int l3index, u8 flags,
1422 		   const u8 *newkey, u8 newkeylen)
1423 {
1424 	struct tcp_sock *tp = tcp_sk(sk);
1425 
1426 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1427 		if (tcp_md5_alloc_sigpool())
1428 			return -ENOMEM;
1429 
1430 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1431 			tcp_md5_release_sigpool();
1432 			return -ENOMEM;
1433 		}
1434 
1435 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1436 			struct tcp_md5sig_info *md5sig;
1437 
1438 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1439 			rcu_assign_pointer(tp->md5sig_info, NULL);
1440 			kfree_rcu(md5sig, rcu);
1441 			tcp_md5_release_sigpool();
1442 			return -EUSERS;
1443 		}
1444 	}
1445 
1446 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1447 				newkey, newkeylen, GFP_KERNEL);
1448 }
1449 EXPORT_IPV6_MOD(tcp_md5_do_add);
1450 
1451 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1452 		     int family, u8 prefixlen, int l3index,
1453 		     struct tcp_md5sig_key *key)
1454 {
1455 	struct tcp_sock *tp = tcp_sk(sk);
1456 
1457 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1458 		tcp_md5_add_sigpool();
1459 
1460 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1461 			tcp_md5_release_sigpool();
1462 			return -ENOMEM;
1463 		}
1464 
1465 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1466 			struct tcp_md5sig_info *md5sig;
1467 
1468 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1469 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1470 			rcu_assign_pointer(tp->md5sig_info, NULL);
1471 			kfree_rcu(md5sig, rcu);
1472 			tcp_md5_release_sigpool();
1473 			return -EUSERS;
1474 		}
1475 	}
1476 
1477 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1478 				key->flags, key->key, key->keylen,
1479 				sk_gfp_mask(sk, GFP_ATOMIC));
1480 }
1481 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1482 
1483 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1484 		   u8 prefixlen, int l3index, u8 flags)
1485 {
1486 	struct tcp_md5sig_key *key;
1487 
1488 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1489 	if (!key)
1490 		return -ENOENT;
1491 	hlist_del_rcu(&key->node);
1492 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1493 	kfree_rcu(key, rcu);
1494 	return 0;
1495 }
1496 EXPORT_IPV6_MOD(tcp_md5_do_del);
1497 
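/* Free every MD5 key attached to the socket on teardown; the plain
 * kfree() relies on no RCU readers still walking the list at this point.
 */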
1498 void tcp_clear_md5_list(struct sock *sk)
1499 {
1500 	struct tcp_sock *tp = tcp_sk(sk);
1501 	struct tcp_md5sig_key *key;
1502 	struct hlist_node *n;
1503 	struct tcp_md5sig_info *md5sig;
1504 
1505 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1506 
1507 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1508 		hlist_del(&key->node);
1509 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1510 		kfree(key);
1511 	}
1512 }
1513 
1514 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1515 				 sockptr_t optval, int optlen)
1516 {
1517 	struct tcp_md5sig cmd;
1518 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1519 	const union tcp_md5_addr *addr;
1520 	u8 prefixlen = 32;
1521 	int l3index = 0;
1522 	bool l3flag;
1523 	u8 flags;
1524 
1525 	if (optlen < sizeof(cmd))
1526 		return -EINVAL;
1527 
1528 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1529 		return -EFAULT;
1530 
1531 	if (sin->sin_family != AF_INET)
1532 		return -EINVAL;
1533 
1534 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1535 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1536 
1537 	if (optname == TCP_MD5SIG_EXT &&
1538 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1539 		prefixlen = cmd.tcpm_prefixlen;
1540 		if (prefixlen > 32)
1541 			return -EINVAL;
1542 	}
1543 
1544 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1545 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1546 		struct net_device *dev;
1547 
1548 		rcu_read_lock();
1549 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1550 		if (dev && netif_is_l3_master(dev))
1551 			l3index = dev->ifindex;
1552 
1553 		rcu_read_unlock();
1554 
1555 		/* ok to reference set/not set outside of rcu;
1556 		 * right now device MUST be an L3 master
1557 		 */
1558 		if (!dev || !l3index)
1559 			return -EINVAL;
1560 	}
1561 
1562 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1563 
1564 	if (!cmd.tcpm_keylen)
1565 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1566 
1567 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1568 		return -EINVAL;
1569 
1570 	/* Don't allow keys for peers that have a matching TCP-AO key.
1571 	 * See the comment in tcp_ao_add_cmd()
1572 	 */
1573 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1574 		return -EKEYREJECTED;
1575 
1576 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1577 			      cmd.tcpm_key, cmd.tcpm_keylen);
1578 }
1579 
1580 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1581 				   __be32 daddr, __be32 saddr,
1582 				   const struct tcphdr *th, int nbytes)
1583 {
1584 	struct tcp4_pseudohdr *bp;
1585 	struct scatterlist sg;
1586 	struct tcphdr *_th;
1587 
1588 	bp = hp->scratch;
1589 	bp->saddr = saddr;
1590 	bp->daddr = daddr;
1591 	bp->pad = 0;
1592 	bp->protocol = IPPROTO_TCP;
1593 	bp->len = cpu_to_be16(nbytes);
1594 
1595 	_th = (struct tcphdr *)(bp + 1);
1596 	memcpy(_th, th, sizeof(*th));
1597 	_th->check = 0;
1598 
1599 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1600 	ahash_request_set_crypt(hp->req, &sg, NULL,
1601 				sizeof(*bp) + sizeof(*th));
1602 	return crypto_ahash_update(hp->req);
1603 }
1604 
1605 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1606 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1607 {
1608 	struct tcp_sigpool hp;
1609 
1610 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1611 		goto clear_hash_nostart;
1612 
1613 	if (crypto_ahash_init(hp.req))
1614 		goto clear_hash;
1615 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1616 		goto clear_hash;
1617 	if (tcp_md5_hash_key(&hp, key))
1618 		goto clear_hash;
1619 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1620 	if (crypto_ahash_final(hp.req))
1621 		goto clear_hash;
1622 
1623 	tcp_sigpool_end(&hp);
1624 	return 0;
1625 
1626 clear_hash:
1627 	tcp_sigpool_end(&hp);
1628 clear_hash_nostart:
1629 	memset(md5_hash, 0, 16);
1630 	return 1;
1631 }
1632 
1633 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1634 			const struct sock *sk,
1635 			const struct sk_buff *skb)
1636 {
1637 	const struct tcphdr *th = tcp_hdr(skb);
1638 	struct tcp_sigpool hp;
1639 	__be32 saddr, daddr;
1640 
1641 	if (sk) { /* valid for establish/request sockets */
1642 		saddr = sk->sk_rcv_saddr;
1643 		daddr = sk->sk_daddr;
1644 	} else {
1645 		const struct iphdr *iph = ip_hdr(skb);
1646 		saddr = iph->saddr;
1647 		daddr = iph->daddr;
1648 	}
1649 
1650 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1651 		goto clear_hash_nostart;
1652 
1653 	if (crypto_ahash_init(hp.req))
1654 		goto clear_hash;
1655 
1656 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1657 		goto clear_hash;
1658 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1659 		goto clear_hash;
1660 	if (tcp_md5_hash_key(&hp, key))
1661 		goto clear_hash;
1662 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1663 	if (crypto_ahash_final(hp.req))
1664 		goto clear_hash;
1665 
1666 	tcp_sigpool_end(&hp);
1667 	return 0;
1668 
1669 clear_hash:
1670 	tcp_sigpool_end(&hp);
1671 clear_hash_nostart:
1672 	memset(md5_hash, 0, 16);
1673 	return 1;
1674 }
1675 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1676 
1677 #endif
1678 
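/* Fill in the IPv4-specific fields of a freshly allocated request sock
 * from the incoming SYN: the addresses are mirrored into the request and
 * any IP options are saved for the later SYN-ACK.
 */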
1679 static void tcp_v4_init_req(struct request_sock *req,
1680 			    const struct sock *sk_listener,
1681 			    struct sk_buff *skb)
1682 {
1683 	struct inet_request_sock *ireq = inet_rsk(req);
1684 	struct net *net = sock_net(sk_listener);
1685 
1686 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1687 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1688 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1689 }
1690 
1691 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1692 					  struct sk_buff *skb,
1693 					  struct flowi *fl,
1694 					  struct request_sock *req,
1695 					  u32 tw_isn)
1696 {
1697 	tcp_v4_init_req(req, sk, skb);
1698 
1699 	if (security_inet_conn_request(sk, skb, req))
1700 		return NULL;
1701 
1702 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1703 }
1704 
1705 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1706 	.family		=	PF_INET,
1707 	.obj_size	=	sizeof(struct tcp_request_sock),
1708 	.send_ack	=	tcp_v4_reqsk_send_ack,
1709 	.destructor	=	tcp_v4_reqsk_destructor,
1710 	.send_reset	=	tcp_v4_send_reset,
1711 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1712 };
1713 
1714 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1715 	.mss_clamp	=	TCP_MSS_DEFAULT,
1716 #ifdef CONFIG_TCP_MD5SIG
1717 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1718 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1719 #endif
1720 #ifdef CONFIG_TCP_AO
1721 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1722 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1723 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1724 #endif
1725 #ifdef CONFIG_SYN_COOKIES
1726 	.cookie_init_seq =	cookie_v4_init_sequence,
1727 #endif
1728 	.route_req	=	tcp_v4_route_req,
1729 	.init_seq	=	tcp_v4_init_seq,
1730 	.init_ts_off	=	tcp_v4_init_ts_off,
1731 	.send_synack	=	tcp_v4_send_synack,
1732 };
1733 
1734 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1735 {
1736 	/* Never answer SYNs sent to broadcast or multicast addresses */
1737 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1738 		goto drop;
1739 
1740 	return tcp_conn_request(&tcp_request_sock_ops,
1741 				&tcp_request_sock_ipv4_ops, sk, skb);
1742 
1743 drop:
1744 	tcp_listendrop(sk);
1745 	return 0;
1746 }
1747 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1748 
1749 
1750 /*
1751  * The three way handshake has completed - we got a valid synack -
1752  * now create the new socket.
1753  */
1754 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1755 				  struct request_sock *req,
1756 				  struct dst_entry *dst,
1757 				  struct request_sock *req_unhash,
1758 				  bool *own_req)
1759 {
1760 	struct inet_request_sock *ireq;
1761 	bool found_dup_sk = false;
1762 	struct inet_sock *newinet;
1763 	struct tcp_sock *newtp;
1764 	struct sock *newsk;
1765 #ifdef CONFIG_TCP_MD5SIG
1766 	const union tcp_md5_addr *addr;
1767 	struct tcp_md5sig_key *key;
1768 	int l3index;
1769 #endif
1770 	struct ip_options_rcu *inet_opt;
1771 
1772 	if (sk_acceptq_is_full(sk))
1773 		goto exit_overflow;
1774 
1775 	newsk = tcp_create_openreq_child(sk, req, skb);
1776 	if (!newsk)
1777 		goto exit_nonewsk;
1778 
1779 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1780 	inet_sk_rx_dst_set(newsk, skb);
1781 
1782 	newtp		      = tcp_sk(newsk);
1783 	newinet		      = inet_sk(newsk);
1784 	ireq		      = inet_rsk(req);
1785 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1786 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1787 	newinet->mc_index     = inet_iif(skb);
1788 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1789 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1790 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1791 	if (inet_opt)
1792 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1793 	atomic_set(&newinet->inet_id, get_random_u16());
1794 
1795 	/* Set ToS of the new socket based upon the value of the incoming SYN.
1796 	 * ECT bits are set later in tcp_init_transfer().
1797 	 */
1798 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1799 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1800 
1801 	if (!dst) {
1802 		dst = inet_csk_route_child_sock(sk, newsk, req);
1803 		if (!dst)
1804 			goto put_and_exit;
1805 	} else {
1806 		/* syncookie case : see end of cookie_v4_check() */
1807 	}
1808 	sk_setup_caps(newsk, dst);
1809 
1810 	tcp_ca_openreq_child(newsk, dst);
1811 
1812 	tcp_sync_mss(newsk, dst_mtu(dst));
1813 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1814 
1815 	tcp_initialize_rcv_mss(newsk);
1816 
1817 #ifdef CONFIG_TCP_MD5SIG
1818 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1819 	/* Copy over the MD5 key from the original socket */
1820 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1821 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1822 	if (key && !tcp_rsk_used_ao(req)) {
1823 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1824 			goto put_and_exit;
1825 		sk_gso_disable(newsk);
1826 	}
1827 #endif
1828 #ifdef CONFIG_TCP_AO
1829 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1830 		goto put_and_exit; /* OOM, release back memory */
1831 #endif
1832 
1833 	if (__inet_inherit_port(sk, newsk) < 0)
1834 		goto put_and_exit;
1835 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1836 				       &found_dup_sk);
1837 	if (likely(*own_req)) {
1838 		tcp_move_syn(newtp, req);
1839 		ireq->ireq_opt = NULL;
1840 	} else {
1841 		newinet->inet_opt = NULL;
1842 
1843 		if (!req_unhash && found_dup_sk) {
1844 			/* This code path should be executed only in the
1845 			 * syncookie case
1846 			 */
1847 			bh_unlock_sock(newsk);
1848 			sock_put(newsk);
1849 			newsk = NULL;
1850 		}
1851 	}
1852 	return newsk;
1853 
1854 exit_overflow:
1855 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1856 exit_nonewsk:
1857 	dst_release(dst);
1858 exit:
1859 	tcp_listendrop(sk);
1860 	return NULL;
1861 put_and_exit:
1862 	newinet->inet_opt = NULL;
1863 	inet_csk_prepare_forced_close(newsk);
1864 	tcp_done(newsk);
1865 	goto exit;
1866 }
1867 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1868 
1869 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1870 {
1871 #ifdef CONFIG_SYN_COOKIES
1872 	const struct tcphdr *th = tcp_hdr(skb);
1873 
1874 	if (!th->syn)
1875 		sk = cookie_v4_check(sk, skb);
1876 #endif
1877 	return sk;
1878 }
1879 
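/* Derive a SYN cookie for callers that generate cookies without going
 * through tcp_conn_request(). Returns the MSS clamped from the SYN
 * (0 if syncookies are not built in or no usable MSS was found), stores
 * the cookie sequence number in *cookie, and records the listen queue
 * overflow so that a subsequent cookie ACK can be honoured.
 */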
1880 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1881 			 struct tcphdr *th, u32 *cookie)
1882 {
1883 	u16 mss = 0;
1884 #ifdef CONFIG_SYN_COOKIES
1885 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1886 				    &tcp_request_sock_ipv4_ops, sk, th);
1887 	if (mss) {
1888 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1889 		tcp_synq_overflow(sk);
1890 	}
1891 #endif
1892 	return mss;
1893 }
1894 
1895 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1896 							   u32));
1897 /* The socket must have its spinlock held when we get
1898  * here, unless it is a TCP_LISTEN socket.
1899  *
1900  * We have a potential double-lock case here, so even when
1901  * doing backlog processing we use the BH locking scheme.
1902  * This is because we cannot sleep with the original spinlock
1903  * held.
1904  */
1905 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1906 {
1907 	enum skb_drop_reason reason;
1908 	struct sock *rsk;
1909 
1910 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1911 		struct dst_entry *dst;
1912 
1913 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1914 						lockdep_sock_is_held(sk));
1915 
1916 		sock_rps_save_rxhash(sk, skb);
1917 		sk_mark_napi_id(sk, skb);
1918 		if (dst) {
1919 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1920 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1921 					     dst, 0)) {
1922 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1923 				dst_release(dst);
1924 			}
1925 		}
1926 		tcp_rcv_established(sk, skb);
1927 		return 0;
1928 	}
1929 
1930 	if (tcp_checksum_complete(skb))
1931 		goto csum_err;
1932 
1933 	if (sk->sk_state == TCP_LISTEN) {
1934 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1935 
1936 		if (!nsk)
1937 			return 0;
1938 		if (nsk != sk) {
1939 			reason = tcp_child_process(sk, nsk, skb);
1940 			if (reason) {
1941 				rsk = nsk;
1942 				goto reset;
1943 			}
1944 			return 0;
1945 		}
1946 	} else
1947 		sock_rps_save_rxhash(sk, skb);
1948 
1949 	reason = tcp_rcv_state_process(sk, skb);
1950 	if (reason) {
1951 		rsk = sk;
1952 		goto reset;
1953 	}
1954 	return 0;
1955 
1956 reset:
1957 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1958 discard:
1959 	sk_skb_reason_drop(sk, skb, reason);
1960 	/* Be careful here. If this function gets more complicated and
1961 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1962 	 * might be destroyed here. This current version compiles correctly,
1963 	 * but you have been warned.
1964 	 */
1965 	return 0;
1966 
1967 csum_err:
1968 	reason = SKB_DROP_REASON_TCP_CSUM;
1969 	trace_tcp_bad_csum(skb);
1970 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1971 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1972 	goto discard;
1973 }
1974 EXPORT_SYMBOL(tcp_v4_do_rcv);
1975 
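/* Early demux: invoked from the IP receive path before routing. If an
 * established socket matches the 4-tuple, attach it to the skb and, when
 * the socket's cached input route is still valid for this interface,
 * reuse it and skip the normal route lookup.
 */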
1976 int tcp_v4_early_demux(struct sk_buff *skb)
1977 {
1978 	struct net *net = dev_net_rcu(skb->dev);
1979 	const struct iphdr *iph;
1980 	const struct tcphdr *th;
1981 	struct sock *sk;
1982 
1983 	if (skb->pkt_type != PACKET_HOST)
1984 		return 0;
1985 
1986 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1987 		return 0;
1988 
1989 	iph = ip_hdr(skb);
1990 	th = tcp_hdr(skb);
1991 
1992 	if (th->doff < sizeof(struct tcphdr) / 4)
1993 		return 0;
1994 
1995 	sk = __inet_lookup_established(net, iph->saddr, th->source,
1996 				       iph->daddr, ntohs(th->dest),
1997 				       skb->skb_iif, inet_sdif(skb));
1998 	if (sk) {
1999 		skb->sk = sk;
2000 		skb->destructor = sock_edemux;
2001 		if (sk_fullsock(sk)) {
2002 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2003 
2004 			if (dst)
2005 				dst = dst_check(dst, 0);
2006 			if (dst &&
2007 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2008 				skb_dst_set_noref(skb, dst);
2009 		}
2010 	}
2011 	return 0;
2012 }
2013 
2014 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2015 		     enum skb_drop_reason *reason)
2016 {
2017 	u32 tail_gso_size, tail_gso_segs;
2018 	struct skb_shared_info *shinfo;
2019 	const struct tcphdr *th;
2020 	struct tcphdr *thtail;
2021 	struct sk_buff *tail;
2022 	unsigned int hdrlen;
2023 	bool fragstolen;
2024 	u32 gso_segs;
2025 	u32 gso_size;
2026 	u64 limit;
2027 	int delta;
2028 	int err;
2029 
2030 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2031 	 * we can fix skb->truesize to its real value to avoid future drops.
2032 	 * This is valid because skb is not yet charged to the socket.
2033 	 * It has been noticed that pure SACK packets were sometimes dropped
2034 	 * (if cooked by drivers without copybreak feature).
2035 	 */
2036 	skb_condense(skb);
2037 
2038 	tcp_cleanup_skb(skb);
2039 
2040 	if (unlikely(tcp_checksum_complete(skb))) {
2041 		bh_unlock_sock(sk);
2042 		trace_tcp_bad_csum(skb);
2043 		*reason = SKB_DROP_REASON_TCP_CSUM;
2044 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2045 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2046 		return true;
2047 	}
2048 
2049 	/* Attempt coalescing to last skb in backlog, even if we are
2050 	 * above the limits.
2051 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2052 	 */
2053 	th = (const struct tcphdr *)skb->data;
2054 	hdrlen = th->doff * 4;
2055 
2056 	tail = sk->sk_backlog.tail;
2057 	if (!tail)
2058 		goto no_coalesce;
2059 	thtail = (struct tcphdr *)tail->data;
2060 
2061 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2062 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2063 	    ((TCP_SKB_CB(tail)->tcp_flags |
2064 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2065 	    !((TCP_SKB_CB(tail)->tcp_flags &
2066 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2067 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2068 	      TCP_SKB_CB(skb)->tcp_flags) &
2069 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2070 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2071 	    thtail->doff != th->doff ||
2072 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2073 		goto no_coalesce;
2074 
2075 	__skb_pull(skb, hdrlen);
2076 
2077 	shinfo = skb_shinfo(skb);
2078 	gso_size = shinfo->gso_size ?: skb->len;
2079 	gso_segs = shinfo->gso_segs ?: 1;
2080 
2081 	shinfo = skb_shinfo(tail);
2082 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2083 	tail_gso_segs = shinfo->gso_segs ?: 1;
2084 
2085 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2086 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2087 
2088 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2089 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2090 			thtail->window = th->window;
2091 		}
2092 
2093 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2094 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2095 		 * is not entered if we append a packet with a FIN.
2096 		 * SYN, RST, URG are not present.
2097 		 * ACK is set on both packets.
2098 		 * PSH: we do not really care in the TCP stack,
2099 		 *       at least for 'GRO' packets.
2100 		 */
2101 		thtail->fin |= th->fin;
2102 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2103 
2104 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2105 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2106 			tail->tstamp = skb->tstamp;
2107 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2108 		}
2109 
2110 		/* Not as strict as GRO. We only need to carry mss max value */
2111 		shinfo->gso_size = max(gso_size, tail_gso_size);
2112 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2113 
2114 		sk->sk_backlog.len += delta;
2115 		__NET_INC_STATS(sock_net(sk),
2116 				LINUX_MIB_TCPBACKLOGCOALESCE);
2117 		kfree_skb_partial(skb, fragstolen);
2118 		return false;
2119 	}
2120 	__skb_push(skb, hdrlen);
2121 
2122 no_coalesce:
2123 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2124 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2125 	 * sk_rcvbuf in normal conditions.
2126 	 */
2127 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2128 
2129 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2130 
2131 	/* Only socket owner can try to collapse/prune rx queues
2132 	 * to reduce memory overhead, so add a little headroom here.
2133 	 * Only a few socket backlogs are likely to be non-empty at the same time.
2134 	 */
2135 	limit += 64 * 1024;
2136 
2137 	limit = min_t(u64, limit, UINT_MAX);
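	/* Illustrative arithmetic, assuming example buffer sizes: with
	 * sk_rcvbuf = 131072 and sk_sndbuf = 16384 the resulting limit is
	 * 2 * 131072 + 16384 / 2 + 65536 = 335872 bytes (well below the
	 * UINT_MAX clamp).
	 */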
2138 
2139 	err = sk_add_backlog(sk, skb, limit);
2140 	if (unlikely(err)) {
2141 		bh_unlock_sock(sk);
2142 		if (err == -ENOMEM) {
2143 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2144 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2145 		} else {
2146 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2147 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2148 		}
2149 		return true;
2150 	}
2151 	return false;
2152 }
2153 EXPORT_IPV6_MOD(tcp_add_backlog);
2154 
2155 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2156 {
2157 	struct tcphdr *th = (struct tcphdr *)skb->data;
2158 
2159 	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
2160 }
2161 EXPORT_IPV6_MOD(tcp_filter);
2162 
2163 static void tcp_v4_restore_cb(struct sk_buff *skb)
2164 {
2165 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2166 		sizeof(struct inet_skb_parm));
2167 }
2168 
2169 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2170 			   const struct tcphdr *th)
2171 {
2172 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
2173 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2174 	 */
2175 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2176 		sizeof(struct inet_skb_parm));
2177 	barrier();
2178 
2179 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2180 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2181 				    skb->len - th->doff * 4);
2182 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2183 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2184 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2185 	TCP_SKB_CB(skb)->sacked	 = 0;
2186 	TCP_SKB_CB(skb)->has_rxtstamp =
2187 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2188 }
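/* Illustration of the end_seq computation above: SYN and FIN each consume
 * one unit of sequence space, so a pure SYN yields end_seq = seq + 1 while
 * a 100 byte data segment with neither flag yields end_seq = seq + 100.
 */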
2189 
2190 /*
2191  *	From tcp_input.c
2192  */
2193 
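/* Main IPv4 TCP receive entry point. Roughly: validate the header length
 * and checksum, look up a socket, let NEW_SYN_RECV and TIME_WAIT mini
 * sockets drive their own state transitions, then either process the
 * segment directly or queue it to the backlog when a user context owns
 * the socket.
 */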
2194 int tcp_v4_rcv(struct sk_buff *skb)
2195 {
2196 	struct net *net = dev_net_rcu(skb->dev);
2197 	enum skb_drop_reason drop_reason;
2198 	enum tcp_tw_status tw_status;
2199 	int sdif = inet_sdif(skb);
2200 	int dif = inet_iif(skb);
2201 	const struct iphdr *iph;
2202 	const struct tcphdr *th;
2203 	struct sock *sk = NULL;
2204 	bool refcounted;
2205 	int ret;
2206 	u32 isn;
2207 
2208 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2209 	if (skb->pkt_type != PACKET_HOST)
2210 		goto discard_it;
2211 
2212 	/* Count it even if it's bad */
2213 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2214 
2215 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2216 		goto discard_it;
2217 
2218 	th = (const struct tcphdr *)skb->data;
2219 
2220 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2221 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2222 		goto bad_packet;
2223 	}
2224 	if (!pskb_may_pull(skb, th->doff * 4))
2225 		goto discard_it;
2226 
2227 	/* An explanation is required here, I think.
2228 	 * Packet length and doff are validated by header prediction,
2229 	 * provided the case of th->doff == 0 is eliminated.
2230 	 * So, we defer the checks. */
2231 
2232 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2233 		goto csum_error;
2234 
2235 	th = (const struct tcphdr *)skb->data;
2236 	iph = ip_hdr(skb);
2237 lookup:
2238 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2239 			       th->dest, sdif, &refcounted);
2240 	if (!sk)
2241 		goto no_tcp_socket;
2242 
2243 	if (sk->sk_state == TCP_TIME_WAIT)
2244 		goto do_time_wait;
2245 
2246 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2247 		struct request_sock *req = inet_reqsk(sk);
2248 		bool req_stolen = false;
2249 		struct sock *nsk;
2250 
2251 		sk = req->rsk_listener;
2252 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2253 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2254 		else
2255 			drop_reason = tcp_inbound_hash(sk, req, skb,
2256 						       &iph->saddr, &iph->daddr,
2257 						       AF_INET, dif, sdif);
2258 		if (unlikely(drop_reason)) {
2259 			sk_drops_skbadd(sk, skb);
2260 			reqsk_put(req);
2261 			goto discard_it;
2262 		}
2263 		if (tcp_checksum_complete(skb)) {
2264 			reqsk_put(req);
2265 			goto csum_error;
2266 		}
2267 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2268 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2269 			if (!nsk) {
2270 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2271 				goto lookup;
2272 			}
2273 			sk = nsk;
2274 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2275 			 * before returning.
2276 			 */
2277 		} else {
2278 			/* We own a reference on the listener, increase it again
2279 			 * as we might lose it too soon.
2280 			 */
2281 			sock_hold(sk);
2282 		}
2283 		refcounted = true;
2284 		nsk = NULL;
2285 		if (!tcp_filter(sk, skb, &drop_reason)) {
2286 			th = (const struct tcphdr *)skb->data;
2287 			iph = ip_hdr(skb);
2288 			tcp_v4_fill_cb(skb, iph, th);
2289 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2290 					    &drop_reason);
2291 		}
2292 		if (!nsk) {
2293 			reqsk_put(req);
2294 			if (req_stolen) {
2295 				/* Another CPU got exclusive access to req
2296 				 * and created a full-blown socket.
2297 				 * Try to feed this packet to this socket
2298 				 * instead of discarding it.
2299 				 */
2300 				tcp_v4_restore_cb(skb);
2301 				sock_put(sk);
2302 				goto lookup;
2303 			}
2304 			goto discard_and_relse;
2305 		}
2306 		nf_reset_ct(skb);
2307 		if (nsk == sk) {
2308 			reqsk_put(req);
2309 			tcp_v4_restore_cb(skb);
2310 		} else {
2311 			drop_reason = tcp_child_process(sk, nsk, skb);
2312 			if (drop_reason) {
2313 				enum sk_rst_reason rst_reason;
2314 
2315 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2316 				tcp_v4_send_reset(nsk, skb, rst_reason);
2317 				goto discard_and_relse;
2318 			}
2319 			sock_put(sk);
2320 			return 0;
2321 		}
2322 	}
2323 
2324 process:
2325 	if (static_branch_unlikely(&ip4_min_ttl)) {
2326 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2327 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2328 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2329 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2330 			goto discard_and_relse;
2331 		}
2332 	}
2333 
2334 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2335 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2336 		goto discard_and_relse;
2337 	}
2338 
2339 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2340 				       AF_INET, dif, sdif);
2341 	if (drop_reason)
2342 		goto discard_and_relse;
2343 
2344 	nf_reset_ct(skb);
2345 
2346 	if (tcp_filter(sk, skb, &drop_reason))
2347 		goto discard_and_relse;
2348 
2349 	th = (const struct tcphdr *)skb->data;
2350 	iph = ip_hdr(skb);
2351 	tcp_v4_fill_cb(skb, iph, th);
2352 
2353 	skb->dev = NULL;
2354 
2355 	if (sk->sk_state == TCP_LISTEN) {
2356 		ret = tcp_v4_do_rcv(sk, skb);
2357 		goto put_and_return;
2358 	}
2359 
2360 	sk_incoming_cpu_update(sk);
2361 
2362 	bh_lock_sock_nested(sk);
2363 	tcp_segs_in(tcp_sk(sk), skb);
2364 	ret = 0;
2365 	if (!sock_owned_by_user(sk)) {
2366 		ret = tcp_v4_do_rcv(sk, skb);
2367 	} else {
2368 		if (tcp_add_backlog(sk, skb, &drop_reason))
2369 			goto discard_and_relse;
2370 	}
2371 	bh_unlock_sock(sk);
2372 
2373 put_and_return:
2374 	if (refcounted)
2375 		sock_put(sk);
2376 
2377 	return ret;
2378 
2379 no_tcp_socket:
2380 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2381 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2382 		goto discard_it;
2383 
2384 	tcp_v4_fill_cb(skb, iph, th);
2385 
2386 	if (tcp_checksum_complete(skb)) {
2387 csum_error:
2388 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2389 		trace_tcp_bad_csum(skb);
2390 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2391 bad_packet:
2392 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2393 	} else {
2394 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2395 	}
2396 
2397 discard_it:
2398 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2399 	/* Discard frame. */
2400 	sk_skb_reason_drop(sk, skb, drop_reason);
2401 	return 0;
2402 
2403 discard_and_relse:
2404 	sk_drops_skbadd(sk, skb);
2405 	if (refcounted)
2406 		sock_put(sk);
2407 	goto discard_it;
2408 
2409 do_time_wait:
2410 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2411 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2412 		inet_twsk_put(inet_twsk(sk));
2413 		goto discard_it;
2414 	}
2415 
2416 	tcp_v4_fill_cb(skb, iph, th);
2417 
2418 	if (tcp_checksum_complete(skb)) {
2419 		inet_twsk_put(inet_twsk(sk));
2420 		goto csum_error;
2421 	}
2422 
2423 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2424 					       &drop_reason);
2425 	switch (tw_status) {
2426 	case TCP_TW_SYN: {
2427 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2428 							iph->saddr, th->source,
2429 							iph->daddr, th->dest,
2430 							inet_iif(skb),
2431 							sdif);
2432 		if (sk2) {
2433 			inet_twsk_deschedule_put(inet_twsk(sk));
2434 			sk = sk2;
2435 			tcp_v4_restore_cb(skb);
2436 			refcounted = false;
2437 			__this_cpu_write(tcp_tw_isn, isn);
2438 			goto process;
2439 		}
2440 	}
2441 		/* to ACK */
2442 		fallthrough;
2443 	case TCP_TW_ACK:
2444 	case TCP_TW_ACK_OOW:
2445 		tcp_v4_timewait_ack(sk, skb, tw_status);
2446 		break;
2447 	case TCP_TW_RST:
2448 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2449 		inet_twsk_deschedule_put(inet_twsk(sk));
2450 		goto discard_it;
2451 	case TCP_TW_SUCCESS:;
2452 	}
2453 	goto discard_it;
2454 }
2455 
2456 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2457 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2458 };
2459 
2460 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2461 {
2462 	struct dst_entry *dst = skb_dst(skb);
2463 
2464 	if (dst && dst_hold_safe(dst)) {
2465 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2466 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2467 	}
2468 }
2469 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2470 
2471 const struct inet_connection_sock_af_ops ipv4_specific = {
2472 	.queue_xmit	   = ip_queue_xmit,
2473 	.send_check	   = tcp_v4_send_check,
2474 	.rebuild_header	   = inet_sk_rebuild_header,
2475 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2476 	.conn_request	   = tcp_v4_conn_request,
2477 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2478 	.net_header_len	   = sizeof(struct iphdr),
2479 	.setsockopt	   = ip_setsockopt,
2480 	.getsockopt	   = ip_getsockopt,
2481 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2482 };
2483 EXPORT_IPV6_MOD(ipv4_specific);
2484 
2485 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2486 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2487 #ifdef CONFIG_TCP_MD5SIG
2488 	.md5_lookup		= tcp_v4_md5_lookup,
2489 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2490 	.md5_parse		= tcp_v4_parse_md5_keys,
2491 #endif
2492 #ifdef CONFIG_TCP_AO
2493 	.ao_lookup		= tcp_v4_ao_lookup,
2494 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2495 	.ao_parse		= tcp_v4_parse_ao,
2496 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2497 #endif
2498 };
2499 
2500 static void tcp4_destruct_sock(struct sock *sk)
2501 {
2502 	tcp_md5_destruct_sock(sk);
2503 	tcp_ao_destroy_sock(sk, false);
2504 	inet_sock_destruct(sk);
2505 }
2506 #endif
2507 
2508 /* NOTE: A lot of things are set to zero explicitly by the call to
2509  *       sk_alloc(), so they need not be done here.
2510  */
2511 static int tcp_v4_init_sock(struct sock *sk)
2512 {
2513 	struct inet_connection_sock *icsk = inet_csk(sk);
2514 
2515 	tcp_init_sock(sk);
2516 
2517 	icsk->icsk_af_ops = &ipv4_specific;
2518 
2519 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2520 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2521 	sk->sk_destruct = tcp4_destruct_sock;
2522 #endif
2523 
2524 	return 0;
2525 }
2526 
2527 static void tcp_release_user_frags(struct sock *sk)
2528 {
2529 #ifdef CONFIG_PAGE_POOL
2530 	unsigned long index;
2531 	void *netmem;
2532 
2533 	xa_for_each(&sk->sk_user_frags, index, netmem)
2534 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2535 #endif
2536 }
2537 
2538 void tcp_v4_destroy_sock(struct sock *sk)
2539 {
2540 	struct tcp_sock *tp = tcp_sk(sk);
2541 
2542 	tcp_release_user_frags(sk);
2543 
2544 	xa_destroy(&sk->sk_user_frags);
2545 
2546 	trace_tcp_destroy_sock(sk);
2547 
2548 	tcp_clear_xmit_timers(sk);
2549 
2550 	tcp_cleanup_congestion_control(sk);
2551 
2552 	tcp_cleanup_ulp(sk);
2553 
2554 	/* Clean up the write buffer. */
2555 	tcp_write_queue_purge(sk);
2556 
2557 	/* Check if we want to disable active TFO */
2558 	tcp_fastopen_active_disable_ofo_check(sk);
2559 
2560 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2561 	skb_rbtree_purge(&tp->out_of_order_queue);
2562 
2563 	/* Clean up a referenced TCP bind bucket. */
2564 	if (inet_csk(sk)->icsk_bind_hash)
2565 		inet_put_port(sk);
2566 
2567 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2568 
2569 	/* If socket is aborted during connect operation */
2570 	tcp_free_fastopen_req(tp);
2571 	tcp_fastopen_destroy_cipher(sk);
2572 	tcp_saved_syn_free(tp);
2573 
2574 	sk_sockets_allocated_dec(sk);
2575 }
2576 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2577 
2578 #ifdef CONFIG_PROC_FS
2579 /* Proc filesystem TCP sock list dumping. */
2580 
2581 static unsigned short seq_file_family(const struct seq_file *seq);
2582 
2583 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2584 {
2585 	unsigned short family = seq_file_family(seq);
2586 
2587 	/* AF_UNSPEC is used as a match all */
2588 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2589 		net_eq(sock_net(sk), seq_file_net(seq)));
2590 }
2591 
2592 /* Find a non empty bucket (starting from st->bucket)
2593  * and return the first sk from it.
2594  */
2595 static void *listening_get_first(struct seq_file *seq)
2596 {
2597 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2598 	struct tcp_iter_state *st = seq->private;
2599 
2600 	st->offset = 0;
2601 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2602 		struct inet_listen_hashbucket *ilb2;
2603 		struct hlist_nulls_node *node;
2604 		struct sock *sk;
2605 
2606 		ilb2 = &hinfo->lhash2[st->bucket];
2607 		if (hlist_nulls_empty(&ilb2->nulls_head))
2608 			continue;
2609 
2610 		spin_lock(&ilb2->lock);
2611 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2612 			if (seq_sk_match(seq, sk))
2613 				return sk;
2614 		}
2615 		spin_unlock(&ilb2->lock);
2616 	}
2617 
2618 	return NULL;
2619 }
2620 
2621 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2622  * If "cur" is the last one in the st->bucket,
2623  * call listening_get_first() to return the first sk of the next
2624  * non empty bucket.
2625  */
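/* Locking note: the ilb2 bucket lock taken in listening_get_first() stays
 * held across successive calls; it is released here only when moving on to
 * the next bucket, or in tcp_seq_stop() if the dump ends mid-bucket.
 */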
2626 static void *listening_get_next(struct seq_file *seq, void *cur)
2627 {
2628 	struct tcp_iter_state *st = seq->private;
2629 	struct inet_listen_hashbucket *ilb2;
2630 	struct hlist_nulls_node *node;
2631 	struct inet_hashinfo *hinfo;
2632 	struct sock *sk = cur;
2633 
2634 	++st->num;
2635 	++st->offset;
2636 
2637 	sk = sk_nulls_next(sk);
2638 	sk_nulls_for_each_from(sk, node) {
2639 		if (seq_sk_match(seq, sk))
2640 			return sk;
2641 	}
2642 
2643 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2644 	ilb2 = &hinfo->lhash2[st->bucket];
2645 	spin_unlock(&ilb2->lock);
2646 	++st->bucket;
2647 	return listening_get_first(seq);
2648 }
2649 
2650 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2651 {
2652 	struct tcp_iter_state *st = seq->private;
2653 	void *rc;
2654 
2655 	st->bucket = 0;
2656 	st->offset = 0;
2657 	rc = listening_get_first(seq);
2658 
2659 	while (rc && *pos) {
2660 		rc = listening_get_next(seq, rc);
2661 		--*pos;
2662 	}
2663 	return rc;
2664 }
2665 
2666 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2667 				const struct tcp_iter_state *st)
2668 {
2669 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2670 }
2671 
2672 /*
2673  * Get first established socket starting from bucket given in st->bucket.
2674  * If st->bucket is zero, the very first socket in the hash is returned.
2675  */
2676 static void *established_get_first(struct seq_file *seq)
2677 {
2678 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2679 	struct tcp_iter_state *st = seq->private;
2680 
2681 	st->offset = 0;
2682 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2683 		struct sock *sk;
2684 		struct hlist_nulls_node *node;
2685 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2686 
2687 		cond_resched();
2688 
2689 		/* Lockless fast path for the common case of empty buckets */
2690 		if (empty_bucket(hinfo, st))
2691 			continue;
2692 
2693 		spin_lock_bh(lock);
2694 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2695 			if (seq_sk_match(seq, sk))
2696 				return sk;
2697 		}
2698 		spin_unlock_bh(lock);
2699 	}
2700 
2701 	return NULL;
2702 }
2703 
2704 static void *established_get_next(struct seq_file *seq, void *cur)
2705 {
2706 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2707 	struct tcp_iter_state *st = seq->private;
2708 	struct hlist_nulls_node *node;
2709 	struct sock *sk = cur;
2710 
2711 	++st->num;
2712 	++st->offset;
2713 
2714 	sk = sk_nulls_next(sk);
2715 
2716 	sk_nulls_for_each_from(sk, node) {
2717 		if (seq_sk_match(seq, sk))
2718 			return sk;
2719 	}
2720 
2721 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2722 	++st->bucket;
2723 	return established_get_first(seq);
2724 }
2725 
2726 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2727 {
2728 	struct tcp_iter_state *st = seq->private;
2729 	void *rc;
2730 
2731 	st->bucket = 0;
2732 	rc = established_get_first(seq);
2733 
2734 	while (rc && pos) {
2735 		rc = established_get_next(seq, rc);
2736 		--pos;
2737 	}
2738 	return rc;
2739 }
2740 
2741 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2742 {
2743 	void *rc;
2744 	struct tcp_iter_state *st = seq->private;
2745 
2746 	st->state = TCP_SEQ_STATE_LISTENING;
2747 	rc	  = listening_get_idx(seq, &pos);
2748 
2749 	if (!rc) {
2750 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2751 		rc	  = established_get_idx(seq, pos);
2752 	}
2753 
2754 	return rc;
2755 }
2756 
2757 static void *tcp_seek_last_pos(struct seq_file *seq)
2758 {
2759 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2760 	struct tcp_iter_state *st = seq->private;
2761 	int bucket = st->bucket;
2762 	int offset = st->offset;
2763 	int orig_num = st->num;
2764 	void *rc = NULL;
2765 
2766 	switch (st->state) {
2767 	case TCP_SEQ_STATE_LISTENING:
2768 		if (st->bucket > hinfo->lhash2_mask)
2769 			break;
2770 		rc = listening_get_first(seq);
2771 		while (offset-- && rc && bucket == st->bucket)
2772 			rc = listening_get_next(seq, rc);
2773 		if (rc)
2774 			break;
2775 		st->bucket = 0;
2776 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2777 		fallthrough;
2778 	case TCP_SEQ_STATE_ESTABLISHED:
2779 		if (st->bucket > hinfo->ehash_mask)
2780 			break;
2781 		rc = established_get_first(seq);
2782 		while (offset-- && rc && bucket == st->bucket)
2783 			rc = established_get_next(seq, rc);
2784 	}
2785 
2786 	st->num = orig_num;
2787 
2788 	return rc;
2789 }
2790 
2791 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2792 {
2793 	struct tcp_iter_state *st = seq->private;
2794 	void *rc;
2795 
2796 	if (*pos && *pos == st->last_pos) {
2797 		rc = tcp_seek_last_pos(seq);
2798 		if (rc)
2799 			goto out;
2800 	}
2801 
2802 	st->state = TCP_SEQ_STATE_LISTENING;
2803 	st->num = 0;
2804 	st->bucket = 0;
2805 	st->offset = 0;
2806 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2807 
2808 out:
2809 	st->last_pos = *pos;
2810 	return rc;
2811 }
2812 EXPORT_IPV6_MOD(tcp_seq_start);
2813 
2814 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2815 {
2816 	struct tcp_iter_state *st = seq->private;
2817 	void *rc = NULL;
2818 
2819 	if (v == SEQ_START_TOKEN) {
2820 		rc = tcp_get_idx(seq, 0);
2821 		goto out;
2822 	}
2823 
2824 	switch (st->state) {
2825 	case TCP_SEQ_STATE_LISTENING:
2826 		rc = listening_get_next(seq, v);
2827 		if (!rc) {
2828 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2829 			st->bucket = 0;
2830 			st->offset = 0;
2831 			rc	  = established_get_first(seq);
2832 		}
2833 		break;
2834 	case TCP_SEQ_STATE_ESTABLISHED:
2835 		rc = established_get_next(seq, v);
2836 		break;
2837 	}
2838 out:
2839 	++*pos;
2840 	st->last_pos = *pos;
2841 	return rc;
2842 }
2843 EXPORT_IPV6_MOD(tcp_seq_next);
2844 
2845 void tcp_seq_stop(struct seq_file *seq, void *v)
2846 {
2847 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2848 	struct tcp_iter_state *st = seq->private;
2849 
2850 	switch (st->state) {
2851 	case TCP_SEQ_STATE_LISTENING:
2852 		if (v != SEQ_START_TOKEN)
2853 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2854 		break;
2855 	case TCP_SEQ_STATE_ESTABLISHED:
2856 		if (v)
2857 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2858 		break;
2859 	}
2860 }
2861 EXPORT_IPV6_MOD(tcp_seq_stop);
2862 
2863 static void get_openreq4(const struct request_sock *req,
2864 			 struct seq_file *f, int i)
2865 {
2866 	const struct inet_request_sock *ireq = inet_rsk(req);
2867 	long delta = req->rsk_timer.expires - jiffies;
2868 
2869 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2870 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2871 		i,
2872 		ireq->ir_loc_addr,
2873 		ireq->ir_num,
2874 		ireq->ir_rmt_addr,
2875 		ntohs(ireq->ir_rmt_port),
2876 		TCP_SYN_RECV,
2877 		0, 0, /* could print option size, but that is af dependent. */
2878 		1,    /* timers active (only the expire timer) */
2879 		jiffies_delta_to_clock_t(delta),
2880 		req->num_timeout,
2881 		from_kuid_munged(seq_user_ns(f),
2882 				 sk_uid(req->rsk_listener)),
2883 		0,  /* non standard timer */
2884 		0, /* open_requests have no inode */
2885 		0,
2886 		req);
2887 }
2888 
2889 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2890 {
2891 	int timer_active;
2892 	unsigned long timer_expires;
2893 	const struct tcp_sock *tp = tcp_sk(sk);
2894 	const struct inet_connection_sock *icsk = inet_csk(sk);
2895 	const struct inet_sock *inet = inet_sk(sk);
2896 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2897 	__be32 dest = inet->inet_daddr;
2898 	__be32 src = inet->inet_rcv_saddr;
2899 	__u16 destp = ntohs(inet->inet_dport);
2900 	__u16 srcp = ntohs(inet->inet_sport);
2901 	u8 icsk_pending;
2902 	int rx_queue;
2903 	int state;
2904 
2905 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2906 	if (icsk_pending == ICSK_TIME_RETRANS ||
2907 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2908 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2909 		timer_active	= 1;
2910 		timer_expires	= icsk_timeout(icsk);
2911 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2912 		timer_active	= 4;
2913 		timer_expires	= icsk_timeout(icsk);
2914 	} else if (timer_pending(&sk->sk_timer)) {
2915 		timer_active	= 2;
2916 		timer_expires	= sk->sk_timer.expires;
2917 	} else {
2918 		timer_active	= 0;
2919 		timer_expires = jiffies;
2920 	}
2921 
2922 	state = inet_sk_state_load(sk);
2923 	if (state == TCP_LISTEN)
2924 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2925 	else
2926 		/* Because we don't lock the socket,
2927 		 * we might find a transient negative value.
2928 		 */
2929 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2930 				      READ_ONCE(tp->copied_seq), 0);
2931 
2932 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2933 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2934 		i, src, srcp, dest, destp, state,
2935 		READ_ONCE(tp->write_seq) - tp->snd_una,
2936 		rx_queue,
2937 		timer_active,
2938 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2939 		READ_ONCE(icsk->icsk_retransmits),
2940 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2941 		READ_ONCE(icsk->icsk_probes_out),
2942 		sock_i_ino(sk),
2943 		refcount_read(&sk->sk_refcnt), sk,
2944 		jiffies_to_clock_t(icsk->icsk_rto),
2945 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2946 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2947 		tcp_snd_cwnd(tp),
2948 		state == TCP_LISTEN ?
2949 		    fastopenq->max_qlen :
2950 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2951 }
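/* Example of the resulting format, with illustrative values: on a
 * little-endian host a local address printed as "0100007F:0CEA" decodes to
 * 127.0.0.1:3306, since the raw __be32 address is emitted with %08X and
 * the port with %04X after ntohs().
 */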
2952 
2953 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2954 			       struct seq_file *f, int i)
2955 {
2956 	long delta = tw->tw_timer.expires - jiffies;
2957 	__be32 dest, src;
2958 	__u16 destp, srcp;
2959 
2960 	dest  = tw->tw_daddr;
2961 	src   = tw->tw_rcv_saddr;
2962 	destp = ntohs(tw->tw_dport);
2963 	srcp  = ntohs(tw->tw_sport);
2964 
2965 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2966 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2967 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2968 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2969 		refcount_read(&tw->tw_refcnt), tw);
2970 }
2971 
2972 #define TMPSZ 150
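/* Every record is padded out to TMPSZ - 1 characters (seq_setwidth() plus
 * seq_pad()) before the trailing newline, so lines in /proc/net/tcp have a
 * fixed width.
 */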
2973 
2974 static int tcp4_seq_show(struct seq_file *seq, void *v)
2975 {
2976 	struct tcp_iter_state *st;
2977 	struct sock *sk = v;
2978 
2979 	seq_setwidth(seq, TMPSZ - 1);
2980 	if (v == SEQ_START_TOKEN) {
2981 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2982 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2983 			   "inode");
2984 		goto out;
2985 	}
2986 	st = seq->private;
2987 
2988 	if (sk->sk_state == TCP_TIME_WAIT)
2989 		get_timewait4_sock(v, seq, st->num);
2990 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2991 		get_openreq4(v, seq, st->num);
2992 	else
2993 		get_tcp4_sock(v, seq, st->num);
2994 out:
2995 	seq_pad(seq, '\n');
2996 	return 0;
2997 }
2998 
2999 #ifdef CONFIG_BPF_SYSCALL
3000 union bpf_tcp_iter_batch_item {
3001 	struct sock *sk;
3002 	__u64 cookie;
3003 };
3004 
3005 struct bpf_tcp_iter_state {
3006 	struct tcp_iter_state state;
3007 	unsigned int cur_sk;
3008 	unsigned int end_sk;
3009 	unsigned int max_sk;
3010 	union bpf_tcp_iter_batch_item *batch;
3011 };
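/* The batch array is filled with held socket pointers while a bucket lock
 * is owned; entries cur_sk..end_sk - 1 have not been shown yet and max_sk
 * is the allocated capacity. When the iterator is stopped early, the
 * remaining entries are converted to socket cookies (see
 * bpf_iter_tcp_put_batch()) so the walk can resume at the same place.
 */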
3012 
3013 struct bpf_iter__tcp {
3014 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3015 	__bpf_md_ptr(struct sock_common *, sk_common);
3016 	uid_t uid __aligned(8);
3017 };
3018 
3019 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3020 			     struct sock_common *sk_common, uid_t uid)
3021 {
3022 	struct bpf_iter__tcp ctx;
3023 
3024 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3025 	ctx.meta = meta;
3026 	ctx.sk_common = sk_common;
3027 	ctx.uid = uid;
3028 	return bpf_iter_run_prog(prog, &ctx);
3029 }
3030 
3031 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3032 {
3033 	union bpf_tcp_iter_batch_item *item;
3034 	unsigned int cur_sk = iter->cur_sk;
3035 	__u64 cookie;
3036 
3037 	/* Remember the cookies of the sockets we haven't seen yet, so we can
3038 	 * pick up where we left off next time around.
3039 	 */
3040 	while (cur_sk < iter->end_sk) {
3041 		item = &iter->batch[cur_sk++];
3042 		cookie = sock_gen_cookie(item->sk);
3043 		sock_gen_put(item->sk);
3044 		item->cookie = cookie;
3045 	}
3046 }
3047 
3048 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3049 				      unsigned int new_batch_sz, gfp_t flags)
3050 {
3051 	union bpf_tcp_iter_batch_item *new_batch;
3052 
3053 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3054 			     flags | __GFP_NOWARN);
3055 	if (!new_batch)
3056 		return -ENOMEM;
3057 
3058 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3059 	kvfree(iter->batch);
3060 	iter->batch = new_batch;
3061 	iter->max_sk = new_batch_sz;
3062 
3063 	return 0;
3064 }
3065 
3066 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3067 					       union bpf_tcp_iter_batch_item *cookies,
3068 					       int n_cookies)
3069 {
3070 	struct hlist_nulls_node *node;
3071 	struct sock *sk;
3072 	int i;
3073 
3074 	for (i = 0; i < n_cookies; i++) {
3075 		sk = first_sk;
3076 		sk_nulls_for_each_from(sk, node)
3077 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3078 				return sk;
3079 	}
3080 
3081 	return NULL;
3082 }
3083 
3084 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3085 {
3086 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3087 	struct bpf_tcp_iter_state *iter = seq->private;
3088 	struct tcp_iter_state *st = &iter->state;
3089 	unsigned int find_cookie = iter->cur_sk;
3090 	unsigned int end_cookie = iter->end_sk;
3091 	int resume_bucket = st->bucket;
3092 	struct sock *sk;
3093 
3094 	if (end_cookie && find_cookie == end_cookie)
3095 		++st->bucket;
3096 
3097 	sk = listening_get_first(seq);
3098 	iter->cur_sk = 0;
3099 	iter->end_sk = 0;
3100 
3101 	if (sk && st->bucket == resume_bucket && end_cookie) {
3102 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3103 						end_cookie - find_cookie);
3104 		if (!sk) {
3105 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3106 			++st->bucket;
3107 			sk = listening_get_first(seq);
3108 		}
3109 	}
3110 
3111 	return sk;
3112 }
3113 
3114 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3115 {
3116 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3117 	struct bpf_tcp_iter_state *iter = seq->private;
3118 	struct tcp_iter_state *st = &iter->state;
3119 	unsigned int find_cookie = iter->cur_sk;
3120 	unsigned int end_cookie = iter->end_sk;
3121 	int resume_bucket = st->bucket;
3122 	struct sock *sk;
3123 
3124 	if (end_cookie && find_cookie == end_cookie)
3125 		++st->bucket;
3126 
3127 	sk = established_get_first(seq);
3128 	iter->cur_sk = 0;
3129 	iter->end_sk = 0;
3130 
3131 	if (sk && st->bucket == resume_bucket && end_cookie) {
3132 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3133 						end_cookie - find_cookie);
3134 		if (!sk) {
3135 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3136 			++st->bucket;
3137 			sk = established_get_first(seq);
3138 		}
3139 	}
3140 
3141 	return sk;
3142 }
3143 
3144 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3145 {
3146 	struct bpf_tcp_iter_state *iter = seq->private;
3147 	struct tcp_iter_state *st = &iter->state;
3148 	struct sock *sk = NULL;
3149 
3150 	switch (st->state) {
3151 	case TCP_SEQ_STATE_LISTENING:
3152 		sk = bpf_iter_tcp_resume_listening(seq);
3153 		if (sk)
3154 			break;
3155 		st->bucket = 0;
3156 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3157 		fallthrough;
3158 	case TCP_SEQ_STATE_ESTABLISHED:
3159 		sk = bpf_iter_tcp_resume_established(seq);
3160 		break;
3161 	}
3162 
3163 	return sk;
3164 }
3165 
3166 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3167 						 struct sock **start_sk)
3168 {
3169 	struct bpf_tcp_iter_state *iter = seq->private;
3170 	struct hlist_nulls_node *node;
3171 	unsigned int expected = 1;
3172 	struct sock *sk;
3173 
3174 	sock_hold(*start_sk);
3175 	iter->batch[iter->end_sk++].sk = *start_sk;
3176 
3177 	sk = sk_nulls_next(*start_sk);
3178 	*start_sk = NULL;
3179 	sk_nulls_for_each_from(sk, node) {
3180 		if (seq_sk_match(seq, sk)) {
3181 			if (iter->end_sk < iter->max_sk) {
3182 				sock_hold(sk);
3183 				iter->batch[iter->end_sk++].sk = sk;
3184 			} else if (!*start_sk) {
3185 				/* Remember where we left off. */
3186 				*start_sk = sk;
3187 			}
3188 			expected++;
3189 		}
3190 	}
3191 
3192 	return expected;
3193 }
3194 
3195 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3196 						   struct sock **start_sk)
3197 {
3198 	struct bpf_tcp_iter_state *iter = seq->private;
3199 	struct hlist_nulls_node *node;
3200 	unsigned int expected = 1;
3201 	struct sock *sk;
3202 
3203 	sock_hold(*start_sk);
3204 	iter->batch[iter->end_sk++].sk = *start_sk;
3205 
3206 	sk = sk_nulls_next(*start_sk);
3207 	*start_sk = NULL;
3208 	sk_nulls_for_each_from(sk, node) {
3209 		if (seq_sk_match(seq, sk)) {
3210 			if (iter->end_sk < iter->max_sk) {
3211 				sock_hold(sk);
3212 				iter->batch[iter->end_sk++].sk = sk;
3213 			} else if (!*start_sk) {
3214 				/* Remember where we left off. */
3215 				*start_sk = sk;
3216 			}
3217 			expected++;
3218 		}
3219 	}
3220 
3221 	return expected;
3222 }
3223 
3224 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3225 					struct sock **start_sk)
3226 {
3227 	struct bpf_tcp_iter_state *iter = seq->private;
3228 	struct tcp_iter_state *st = &iter->state;
3229 
3230 	if (st->state == TCP_SEQ_STATE_LISTENING)
3231 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3232 	else
3233 		return bpf_iter_tcp_established_batch(seq, start_sk);
3234 }
3235 
3236 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3237 {
3238 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3239 	struct bpf_tcp_iter_state *iter = seq->private;
3240 	struct tcp_iter_state *st = &iter->state;
3241 
3242 	if (st->state == TCP_SEQ_STATE_LISTENING)
3243 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3244 	else
3245 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3246 }
3247 
3248 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3249 {
3250 	struct bpf_tcp_iter_state *iter = seq->private;
3251 	unsigned int expected;
3252 	struct sock *sk;
3253 	int err;
3254 
3255 	sk = bpf_iter_tcp_resume(seq);
3256 	if (!sk)
3257 		return NULL; /* Done */
3258 
3259 	expected = bpf_iter_fill_batch(seq, &sk);
3260 	if (likely(iter->end_sk == expected))
3261 		goto done;
3262 
3263 	/* Batch size was too small. */
3264 	bpf_iter_tcp_unlock_bucket(seq);
3265 	bpf_iter_tcp_put_batch(iter);
3266 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3267 					 GFP_USER);
3268 	if (err)
3269 		return ERR_PTR(err);
3270 
3271 	sk = bpf_iter_tcp_resume(seq);
3272 	if (!sk)
3273 		return NULL; /* Done */
3274 
3275 	expected = bpf_iter_fill_batch(seq, &sk);
3276 	if (likely(iter->end_sk == expected))
3277 		goto done;
3278 
3279 	/* Batch size was still too small. Hold onto the lock while we try
3280 	 * again with a larger batch to make sure the current bucket's size
3281 	 * does not change in the meantime.
3282 	 */
3283 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3284 	if (err) {
3285 		bpf_iter_tcp_unlock_bucket(seq);
3286 		return ERR_PTR(err);
3287 	}
3288 
3289 	expected = bpf_iter_fill_batch(seq, &sk);
3290 	WARN_ON_ONCE(iter->end_sk != expected);
3291 done:
3292 	bpf_iter_tcp_unlock_bucket(seq);
3293 	return iter->batch[0].sk;
3294 }
3295 
3296 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3297 {
3298 	/* bpf iter does not support lseek, so it always
3299 	 * continues from where it was stop()-ped.
3300 	 */
3301 	if (*pos)
3302 		return bpf_iter_tcp_batch(seq);
3303 
3304 	return SEQ_START_TOKEN;
3305 }
3306 
3307 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3308 {
3309 	struct bpf_tcp_iter_state *iter = seq->private;
3310 	struct tcp_iter_state *st = &iter->state;
3311 	struct sock *sk;
3312 
3313 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3314 	 * been through seq_show(), so advance to the next sk in
3315 	 * the batch.
3316 	 */
3317 	if (iter->cur_sk < iter->end_sk) {
3318 		/* Keeping st->num consistent in tcp_iter_state.
3319 		 * bpf_iter_tcp does not use st->num.
3320 		 * meta.seq_num is used instead.
3321 		 */
3322 		st->num++;
3323 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3324 	}
3325 
3326 	if (iter->cur_sk < iter->end_sk)
3327 		sk = iter->batch[iter->cur_sk].sk;
3328 	else
3329 		sk = bpf_iter_tcp_batch(seq);
3330 
3331 	++*pos;
3332 	/* Keeping st->last_pos consistent in tcp_iter_state.
3333 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3334 	 */
3335 	st->last_pos = *pos;
3336 	return sk;
3337 }
3338 
3339 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3340 {
3341 	struct bpf_iter_meta meta;
3342 	struct bpf_prog *prog;
3343 	struct sock *sk = v;
3344 	uid_t uid;
3345 	int ret;
3346 
3347 	if (v == SEQ_START_TOKEN)
3348 		return 0;
3349 
3350 	if (sk_fullsock(sk))
3351 		lock_sock(sk);
3352 
3353 	if (unlikely(sk_unhashed(sk))) {
3354 		ret = SEQ_SKIP;
3355 		goto unlock;
3356 	}
3357 
3358 	if (sk->sk_state == TCP_TIME_WAIT) {
3359 		uid = 0;
3360 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3361 		const struct request_sock *req = v;
3362 
3363 		uid = from_kuid_munged(seq_user_ns(seq),
3364 				       sk_uid(req->rsk_listener));
3365 	} else {
3366 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3367 	}
3368 
3369 	meta.seq = seq;
3370 	prog = bpf_iter_get_info(&meta, false);
3371 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3372 
3373 unlock:
3374 	if (sk_fullsock(sk))
3375 		release_sock(sk);
3376 	return ret;
3377 
3378 }
3379 
3380 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3381 {
3382 	struct bpf_tcp_iter_state *iter = seq->private;
3383 	struct bpf_iter_meta meta;
3384 	struct bpf_prog *prog;
3385 
3386 	if (!v) {
3387 		meta.seq = seq;
3388 		prog = bpf_iter_get_info(&meta, true);
3389 		if (prog)
3390 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3391 	}
3392 
3393 	if (iter->cur_sk < iter->end_sk)
3394 		bpf_iter_tcp_put_batch(iter);
3395 }
3396 
3397 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3398 	.show		= bpf_iter_tcp_seq_show,
3399 	.start		= bpf_iter_tcp_seq_start,
3400 	.next		= bpf_iter_tcp_seq_next,
3401 	.stop		= bpf_iter_tcp_seq_stop,
3402 };
3403 #endif
3404 static unsigned short seq_file_family(const struct seq_file *seq)
3405 {
3406 	const struct tcp_seq_afinfo *afinfo;
3407 
3408 #ifdef CONFIG_BPF_SYSCALL
3409 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3410 	if (seq->op == &bpf_iter_tcp_seq_ops)
3411 		return AF_UNSPEC;
3412 #endif
3413 
3414 	/* Iterated from proc fs */
3415 	afinfo = pde_data(file_inode(seq->file));
3416 	return afinfo->family;
3417 }
3418 
3419 static const struct seq_operations tcp4_seq_ops = {
3420 	.show		= tcp4_seq_show,
3421 	.start		= tcp_seq_start,
3422 	.next		= tcp_seq_next,
3423 	.stop		= tcp_seq_stop,
3424 };
3425 
3426 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3427 	.family		= AF_INET,
3428 };
3429 
3430 static int __net_init tcp4_proc_init_net(struct net *net)
3431 {
3432 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3433 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3434 		return -ENOMEM;
3435 	return 0;
3436 }
3437 
3438 static void __net_exit tcp4_proc_exit_net(struct net *net)
3439 {
3440 	remove_proc_entry("tcp", net->proc_net);
3441 }
3442 
3443 static struct pernet_operations tcp4_net_ops = {
3444 	.init = tcp4_proc_init_net,
3445 	.exit = tcp4_proc_exit_net,
3446 };
3447 
3448 int __init tcp4_proc_init(void)
3449 {
3450 	return register_pernet_subsys(&tcp4_net_ops);
3451 }
3452 
3453 void tcp4_proc_exit(void)
3454 {
3455 	unregister_pernet_subsys(&tcp4_net_ops);
3456 }
3457 #endif /* CONFIG_PROC_FS */
3458 
3459 /* @wake is one when sk_stream_write_space() calls us.
3460  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3461  * This mimics the strategy used in sock_def_write_space().
3462  */
3463 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3464 {
3465 	const struct tcp_sock *tp = tcp_sk(sk);
3466 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3467 			    READ_ONCE(tp->snd_nxt);
3468 
3469 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3470 }
3471 EXPORT_SYMBOL(tcp_stream_memory_free);
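/* Worked example, assuming a notsent_lowat of 128 KB: with wake == 1 the
 * test becomes notsent_bytes * 2 < 131072, so EPOLLOUT is only signalled
 * once fewer than 64 KB remain unsent, mirroring sock_def_write_space().
 */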
3472 
3473 struct proto tcp_prot = {
3474 	.name			= "TCP",
3475 	.owner			= THIS_MODULE,
3476 	.close			= tcp_close,
3477 	.pre_connect		= tcp_v4_pre_connect,
3478 	.connect		= tcp_v4_connect,
3479 	.disconnect		= tcp_disconnect,
3480 	.accept			= inet_csk_accept,
3481 	.ioctl			= tcp_ioctl,
3482 	.init			= tcp_v4_init_sock,
3483 	.destroy		= tcp_v4_destroy_sock,
3484 	.shutdown		= tcp_shutdown,
3485 	.setsockopt		= tcp_setsockopt,
3486 	.getsockopt		= tcp_getsockopt,
3487 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3488 	.keepalive		= tcp_set_keepalive,
3489 	.recvmsg		= tcp_recvmsg,
3490 	.sendmsg		= tcp_sendmsg,
3491 	.splice_eof		= tcp_splice_eof,
3492 	.backlog_rcv		= tcp_v4_do_rcv,
3493 	.release_cb		= tcp_release_cb,
3494 	.hash			= inet_hash,
3495 	.unhash			= inet_unhash,
3496 	.get_port		= inet_csk_get_port,
3497 	.put_port		= inet_put_port,
3498 #ifdef CONFIG_BPF_SYSCALL
3499 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3500 #endif
3501 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3502 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3503 	.stream_memory_free	= tcp_stream_memory_free,
3504 	.sockets_allocated	= &tcp_sockets_allocated,
3505 
3506 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3507 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3508 
3509 	.memory_pressure	= &tcp_memory_pressure,
3510 	.sysctl_mem		= sysctl_tcp_mem,
3511 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3512 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3513 	.max_header		= MAX_TCP_HEADER,
3514 	.obj_size		= sizeof(struct tcp_sock),
3515 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3516 	.twsk_prot		= &tcp_timewait_sock_ops,
3517 	.rsk_prot		= &tcp_request_sock_ops,
3518 	.h.hashinfo		= NULL,
3519 	.no_autobind		= true,
3520 	.diag_destroy		= tcp_abort,
3521 };
3522 EXPORT_SYMBOL(tcp_prot);
3523 
3524 static void __net_exit tcp_sk_exit(struct net *net)
3525 {
3526 	if (net->ipv4.tcp_congestion_control)
3527 		bpf_module_put(net->ipv4.tcp_congestion_control,
3528 			       net->ipv4.tcp_congestion_control->owner);
3529 }
3530 
3531 static void __net_init tcp_set_hashinfo(struct net *net)
3532 {
3533 	struct inet_hashinfo *hinfo;
3534 	unsigned int ehash_entries;
3535 	struct net *old_net;
3536 
3537 	if (net_eq(net, &init_net))
3538 		goto fallback;
3539 
3540 	old_net = current->nsproxy->net_ns;
3541 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3542 	if (!ehash_entries)
3543 		goto fallback;
3544 
3545 	ehash_entries = roundup_pow_of_two(ehash_entries);
3546 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3547 	if (!hinfo) {
3548 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3549 			"for a netns, fallback to the global one\n",
3550 			ehash_entries);
3551 fallback:
3552 		hinfo = &tcp_hashinfo;
3553 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3554 	}
3555 
3556 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3557 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3558 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3559 }
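/* Worked example, with a hypothetical setting: if the parent netns has
 * sysctl_tcp_child_ehash_entries = 1000, the child gets a 1024-entry
 * ehash (rounded up to a power of two), 512 allowed TIME_WAIT buckets and
 * a max_syn_backlog of max(128, 1024 / 128) = 128.
 */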
3560 
3561 static int __net_init tcp_sk_init(struct net *net)
3562 {
3563 	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3564 	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3565 	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3566 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3567 
3568 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3569 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3570 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3571 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3572 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3573 
3574 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3575 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3576 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3577 
3578 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3579 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3580 	net->ipv4.sysctl_tcp_syncookies = 1;
3581 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3582 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3583 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3584 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3585 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3586 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3587 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3588 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3589 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3590 
3591 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3592 	tcp_set_hashinfo(net);
3593 
3594 	net->ipv4.sysctl_tcp_sack = 1;
3595 	net->ipv4.sysctl_tcp_window_scaling = 1;
3596 	net->ipv4.sysctl_tcp_timestamps = 1;
3597 	net->ipv4.sysctl_tcp_early_retrans = 3;
3598 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3599 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3600 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3601 	net->ipv4.sysctl_tcp_max_reordering = 300;
3602 	net->ipv4.sysctl_tcp_dsack = 1;
3603 	net->ipv4.sysctl_tcp_app_win = 31;
3604 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3605 	net->ipv4.sysctl_tcp_frto = 2;
3606 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3607 	/* This limits the percentage of the congestion window which we
3608 	 * will allow a single TSO frame to consume.  Building TSO frames
3609 	 * which are too large can cause TCP streams to be bursty.
3610 	 */
3611 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3612 	/* Default TSQ limit of 4 MB */
3613 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3614 
3615 	/* RFC 5961 challenge ACK rate limiting, per netns; disabled by default. */
3616 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3617 
3618 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3619 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3620 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3621 	net->ipv4.sysctl_tcp_autocorking = 1;
3622 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3623 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3624 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
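	/* Child netns start from whatever tcp_{r,w}mem init_net currently uses
	 * (possibly tuned since boot); init_net's own defaults are computed in
	 * tcp_init().
	 */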
3625 	if (net != &init_net) {
3626 		memcpy(net->ipv4.sysctl_tcp_rmem,
3627 		       init_net.ipv4.sysctl_tcp_rmem,
3628 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3629 		memcpy(net->ipv4.sysctl_tcp_wmem,
3630 		       init_net.ipv4.sysctl_tcp_wmem,
3631 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3632 	}
3633 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3634 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3635 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3636 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
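	/* TCP Fast Open: client support is on by default (TFO_CLIENT_ENABLE);
	 * the server side must be enabled explicitly via this sysctl or the
	 * TCP_FASTOPEN socket option.
	 */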
3637 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3638 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3639 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3640 
3641 	/* Set default values for PLB */
3642 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3643 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3644 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3645 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3646 	/* Default congestion threshold for PLB to mark a round is 50% */
3647 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3648 
3649 	/* Inherit init_net's congestion control if possible; Reno is built in as the fallback. */
3650 	if (!net_eq(net, &init_net) &&
3651 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3652 			       init_net.ipv4.tcp_congestion_control->owner))
3653 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3654 	else
3655 		net->ipv4.tcp_congestion_control = &tcp_reno;
3656 
3657 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3658 	net->ipv4.sysctl_tcp_shrink_window = 0;
3659 
3660 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
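	/* RTO bounds default to the classic 200 ms minimum and 120 s maximum,
	 * both tunable per netns.
	 */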
3661 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3662 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3663 
3664 	return 0;
3665 }
3666 
3667 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3668 {
3669 	struct net *net;
3670 
3671 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3672 	 * and the failed setup_net() error unwinding path are serialized.
3673 	 *
3674 	 * Since tcp_twsk_purge() handles twsk in any dead netns, not just those
3675 	 * on net_exit_list, the thread that dismantles a particular twsk must
3676 	 * do so without another thread progressing to refcount_dec_and_test()
3677 	 * of tcp_death_row.tw_refcount.
3678 	 */
3679 	mutex_lock(&tcp_exit_batch_mutex);
3680 
3681 	tcp_twsk_purge(net_exit_list);
3682 
3683 	list_for_each_entry(net, net_exit_list, exit_list) {
3684 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3685 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3686 		tcp_fastopen_ctx_destroy(net);
3687 	}
3688 
3689 	mutex_unlock(&tcp_exit_batch_mutex);
3690 }
3691 
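/* ->init runs for every netns (including init_net), ->exit for each netns
 * being dismantled, and ->exit_batch once per batch of dying netns so that
 * tcp_twsk_purge() and the ehash teardown can handle them together.
 */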
3692 static struct pernet_operations __net_initdata tcp_sk_ops = {
3693 	.init		= tcp_sk_init,
3694 	.exit		= tcp_sk_exit,
3695 	.exit_batch	= tcp_sk_exit_batch,
3696 };
3697 
3698 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3699 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3700 		     struct sock_common *sk_common, uid_t uid)
3701 
3702 #define INIT_BATCH_SZ 16
3703 
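/* The iterator batches the sockets of one hash bucket at a time: the batch
 * starts with room for INIT_BATCH_SZ entries and bpf_iter_tcp_realloc_batch()
 * grows it when a bucket holds more sockets than currently fit.
 */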
3704 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3705 {
3706 	struct bpf_tcp_iter_state *iter = priv_data;
3707 	int err;
3708 
3709 	err = bpf_iter_init_seq_net(priv_data, aux);
3710 	if (err)
3711 		return err;
3712 
3713 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3714 	if (err) {
3715 		bpf_iter_fini_seq_net(priv_data);
3716 		return err;
3717 	}
3718 
3719 	return 0;
3720 }
3721 
3722 static void bpf_iter_fini_tcp(void *priv_data)
3723 {
3724 	struct bpf_tcp_iter_state *iter = priv_data;
3725 
3726 	bpf_iter_fini_seq_net(priv_data);
3727 	kvfree(iter->batch);
3728 }
3729 
3730 static const struct bpf_iter_seq_info tcp_seq_info = {
3731 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3732 	.init_seq_private	= bpf_iter_init_tcp,
3733 	.fini_seq_private	= bpf_iter_fini_tcp,
3734 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3735 };
3736 
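/* bpf_setsockopt()/bpf_getsockopt() are made available to TCP iterator
 * programs so they can inspect or retune options (e.g. TCP_CONGESTION) on
 * every socket they visit.
 */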
3737 static const struct bpf_func_proto *
3738 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3739 			    const struct bpf_prog *prog)
3740 {
3741 	switch (func_id) {
3742 	case BPF_FUNC_setsockopt:
3743 		return &bpf_sk_setsockopt_proto;
3744 	case BPF_FUNC_getsockopt:
3745 		return &bpf_sk_getsockopt_proto;
3746 	default:
3747 		return NULL;
3748 	}
3749 }
3750 
3751 static struct bpf_iter_reg tcp_reg_info = {
3752 	.target			= "tcp",
3753 	.ctx_arg_info_size	= 1,
3754 	.ctx_arg_info		= {
3755 		{ offsetof(struct bpf_iter__tcp, sk_common),
3756 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3757 	},
3758 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3759 	.seq_info		= &tcp_seq_info,
3760 };
3761 
3762 static void __init bpf_iter_register(void)
3763 {
3764 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3765 	if (bpf_iter_reg_target(&tcp_reg_info))
3766 		pr_warn("Warning: could not register bpf iterator tcp\n");
3767 }
3768 
3769 #endif
3770 
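/* Build one kernel control socket per possible CPU; tcp_v4_send_reset() and
 * tcp_v4_send_ack() use them to transmit RSTs and ACKs that are not tied to
 * a full socket (SYN-RECV and TIME-WAIT handling).
 */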
3771 void __init tcp_v4_init(void)
3772 {
3773 	int cpu, res;
3774 
3775 	for_each_possible_cpu(cpu) {
3776 		struct sock *sk;
3777 
3778 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3779 					   IPPROTO_TCP, &init_net);
3780 		if (res)
3781 			panic("Failed to create the TCP control socket.\n");
3782 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3783 
3784 		/* Enforce IP_DF and IPID==0 for RST and ACK packets
3785 		 * sent in SYN-RECV and TIME-WAIT state.
3786 		 */
3787 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3788 
3789 		sk->sk_clockid = CLOCK_MONOTONIC;
3790 
3791 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3792 	}
3793 	if (register_pernet_subsys(&tcp_sk_ops))
3794 		panic("Failed to register the TCP pernet operations.\n");
3795 
3796 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3797 	bpf_iter_register();
3798 #endif
3799 }
3800