xref: /linux/net/ipv4/tcp_ipv4.c (revision 061834624c87282c6d9d8c5395aaff4380e5e1fc)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
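/* Per-cpu kernel control socket: tcp_v4_send_reset() and tcp_v4_send_ack()
 * borrow it (under local_bh_disable()) to emit RSTs and out-of-socket-context
 * ACKs via ip_send_unicast_reply() without needing a full socket.
 */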
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
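	/* sysctl_tcp_tw_reuse: 0 - never reuse TIME-WAIT sockets for new
	 * outgoing connections, 1 - reuse when safe, 2 - reuse only for
	 * loopback connections (checked below).
	 */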
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, reuse is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
145 
146 	   The idea is close to VJ's: the timestamp cache is held
147 	   not per host but per port pair, and the TW bucket is used as the
148 	   state holder.
149 
150 	   If the TW bucket has already been destroyed we fall back to VJ's
151 	   scheme and use the initial timestamp retrieved from the peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent BPF program called below from accessing bytes that are out
189 	 * of the bound specified by user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 	__be32 daddr, nexthop, prev_sk_rcv_saddr;
205 	struct inet_sock *inet = inet_sk(sk);
206 	struct tcp_sock *tp = tcp_sk(sk);
207 	__be16 orig_sport, orig_dport;
208 	struct flowi4 *fl4;
209 	struct rtable *rt;
210 	int err;
211 	struct ip_options_rcu *inet_opt;
212 	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
213 
214 	if (addr_len < sizeof(struct sockaddr_in))
215 		return -EINVAL;
216 
217 	if (usin->sin_family != AF_INET)
218 		return -EAFNOSUPPORT;
219 
220 	nexthop = daddr = usin->sin_addr.s_addr;
221 	inet_opt = rcu_dereference_protected(inet->inet_opt,
222 					     lockdep_sock_is_held(sk));
223 	if (inet_opt && inet_opt->opt.srr) {
224 		if (!daddr)
225 			return -EINVAL;
226 		nexthop = inet_opt->opt.faddr;
227 	}
228 
229 	orig_sport = inet->inet_sport;
230 	orig_dport = usin->sin_port;
231 	fl4 = &inet->cork.fl.u.ip4;
232 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234 			      orig_dport, sk);
235 	if (IS_ERR(rt)) {
236 		err = PTR_ERR(rt);
237 		if (err == -ENETUNREACH)
238 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
239 		return err;
240 	}
241 
242 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243 		ip_rt_put(rt);
244 		return -ENETUNREACH;
245 	}
246 
247 	if (!inet_opt || !inet_opt->opt.srr)
248 		daddr = fl4->daddr;
249 
250 	if (!inet->inet_saddr) {
251 		if (inet_csk(sk)->icsk_bind2_hash) {
252 			prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo,
253 								     sk, sock_net(sk),
254 								     inet->inet_num);
255 			prev_sk_rcv_saddr = sk->sk_rcv_saddr;
256 		}
257 		inet->inet_saddr = fl4->saddr;
258 	}
259 
260 	sk_rcv_saddr_set(sk, inet->inet_saddr);
261 
262 	if (prev_addr_hashbucket) {
263 		err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
264 		if (err) {
265 			inet->inet_saddr = 0;
266 			sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
267 			ip_rt_put(rt);
268 			return err;
269 		}
270 	}
271 
272 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
273 		/* Reset inherited state */
274 		tp->rx_opt.ts_recent	   = 0;
275 		tp->rx_opt.ts_recent_stamp = 0;
276 		if (likely(!tp->repair))
277 			WRITE_ONCE(tp->write_seq, 0);
278 	}
279 
280 	inet->inet_dport = usin->sin_port;
281 	sk_daddr_set(sk, daddr);
282 
283 	inet_csk(sk)->icsk_ext_hdr_len = 0;
284 	if (inet_opt)
285 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
286 
287 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
288 
289 	/* Socket identity is still unknown (sport may be zero).
290 	 * However we set state to SYN-SENT and, without releasing the socket
291 	 * lock, select a source port, enter ourselves into the hash tables and
292 	 * complete initialization after this.
293 	 */
294 	tcp_set_state(sk, TCP_SYN_SENT);
295 	err = inet_hash_connect(tcp_death_row, sk);
296 	if (err)
297 		goto failure;
298 
299 	sk_set_txhash(sk);
300 
301 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
302 			       inet->inet_sport, inet->inet_dport, sk);
303 	if (IS_ERR(rt)) {
304 		err = PTR_ERR(rt);
305 		rt = NULL;
306 		goto failure;
307 	}
308 	/* OK, now commit destination to socket.  */
309 	sk->sk_gso_type = SKB_GSO_TCPV4;
310 	sk_setup_caps(sk, &rt->dst);
311 	rt = NULL;
312 
313 	if (likely(!tp->repair)) {
314 		if (!tp->write_seq)
315 			WRITE_ONCE(tp->write_seq,
316 				   secure_tcp_seq(inet->inet_saddr,
317 						  inet->inet_daddr,
318 						  inet->inet_sport,
319 						  usin->sin_port));
320 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
321 						 inet->inet_saddr,
322 						 inet->inet_daddr);
323 	}
324 
325 	inet->inet_id = prandom_u32();
326 
327 	if (tcp_fastopen_defer_connect(sk, &err))
328 		return err;
329 	if (err)
330 		goto failure;
331 
332 	err = tcp_connect(sk);
333 
334 	if (err)
335 		goto failure;
336 
337 	return 0;
338 
339 failure:
340 	/*
341 	 * This unhashes the socket and releases the local port,
342 	 * if necessary.
343 	 */
344 	tcp_set_state(sk, TCP_CLOSE);
345 	ip_rt_put(rt);
346 	sk->sk_route_caps = 0;
347 	inet->inet_dport = 0;
348 	return err;
349 }
350 EXPORT_SYMBOL(tcp_v4_connect);
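/* A minimal userspace sketch of how this path is reached (connect() on an
 * unconnected TCP socket ends up here via inet_stream_connect()):
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *		.sin_addr   = { htonl(INADDR_LOOPBACK) },
 *	};
 *
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * tcp_v4_connect() then routes the destination, binds a source address and
 * port, moves the socket to SYN-SENT and sends the SYN via tcp_connect().
 */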
351 
352 /*
353  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
354  * It can be called through tcp_release_cb() if socket was owned by user
355  * at the time tcp_v4_err() was called to handle ICMP message.
356  */
357 void tcp_v4_mtu_reduced(struct sock *sk)
358 {
359 	struct inet_sock *inet = inet_sk(sk);
360 	struct dst_entry *dst;
361 	u32 mtu;
362 
363 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
364 		return;
365 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
366 	dst = inet_csk_update_pmtu(sk, mtu);
367 	if (!dst)
368 		return;
369 
370 	/* Something is about to go wrong... Remember the soft error
371 	 * in case this connection will not be able to recover.
372 	 */
373 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
374 		sk->sk_err_soft = EMSGSIZE;
375 
376 	mtu = dst_mtu(dst);
377 
378 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
379 	    ip_sk_accept_pmtu(sk) &&
380 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
381 		tcp_sync_mss(sk, mtu);
382 
383 		/* Resend the TCP packet because it's
384 		 * clear that the old packet has been
385 		 * dropped. This is the new "fast" path mtu
386 		 * discovery.
387 		 */
388 		tcp_simple_retransmit(sk);
389 	} /* else let the usual retransmit timer handle it */
390 }
391 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
392 
393 static void do_redirect(struct sk_buff *skb, struct sock *sk)
394 {
395 	struct dst_entry *dst = __sk_dst_check(sk, 0);
396 
397 	if (dst)
398 		dst->ops->redirect(dst, sk, skb);
399 }
400 
401 
402 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
403 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
404 {
405 	struct request_sock *req = inet_reqsk(sk);
406 	struct net *net = sock_net(sk);
407 
408 	/* ICMPs are not backlogged, hence we cannot get
409 	 * an established socket here.
410 	 */
411 	if (seq != tcp_rsk(req)->snt_isn) {
412 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
413 	} else if (abort) {
414 		/*
415 		 * Still in SYN_RECV, just remove it silently.
416 		 * There is no good way to pass the error to the newly
417 		 * created socket, and POSIX does not want network
418 		 * errors returned from accept().
419 		 */
420 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
421 		tcp_listendrop(req->rsk_listener);
422 	}
423 	reqsk_put(req);
424 }
425 EXPORT_SYMBOL(tcp_req_err);
426 
427 /* TCP-LD (RFC 6069) logic */
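/* RFC 6069 ("TCP-LD"): when an ICMP unreachable matches a segment we are
 * currently backing off on, the loss is likely due to a connectivity
 * disruption rather than congestion, so undo the last RTO backoff step and
 * re-arm (or fire) the retransmit timer to probe the path again sooner.
 */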
428 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
429 {
430 	struct inet_connection_sock *icsk = inet_csk(sk);
431 	struct tcp_sock *tp = tcp_sk(sk);
432 	struct sk_buff *skb;
433 	s32 remaining;
434 	u32 delta_us;
435 
436 	if (sock_owned_by_user(sk))
437 		return;
438 
439 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
440 	    !icsk->icsk_backoff)
441 		return;
442 
443 	skb = tcp_rtx_queue_head(sk);
444 	if (WARN_ON_ONCE(!skb))
445 		return;
446 
447 	icsk->icsk_backoff--;
448 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
449 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
450 
451 	tcp_mstamp_refresh(tp);
452 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
453 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
454 
455 	if (remaining > 0) {
456 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
457 					  remaining, TCP_RTO_MAX);
458 	} else {
459 		/* RTO revert clocked out retransmission.
460 		 * Will retransmit now.
461 		 */
462 		tcp_retransmit_timer(sk);
463 	}
464 }
465 EXPORT_SYMBOL(tcp_ld_RTO_revert);
466 
467 /*
468  * This routine is called by the ICMP module when it gets some
469  * sort of error condition.  If err < 0 then the socket should
470  * be closed and the error returned to the user.  If err > 0
471  * it's just the icmp type << 8 | icmp code.  After adjustment
472  * header points to the first 8 bytes of the tcp header.  We need
473  * to find the appropriate port.
474  *
475  * The locking strategy used here is very "optimistic". When
476  * someone else accesses the socket the ICMP is just dropped
477  * and for some paths there is no check at all.
478  * A more general error queue to queue errors for later handling
479  * is probably better.
480  *
481  */
482 
483 int tcp_v4_err(struct sk_buff *skb, u32 info)
484 {
485 	const struct iphdr *iph = (const struct iphdr *)skb->data;
486 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
487 	struct tcp_sock *tp;
488 	struct inet_sock *inet;
489 	const int type = icmp_hdr(skb)->type;
490 	const int code = icmp_hdr(skb)->code;
491 	struct sock *sk;
492 	struct request_sock *fastopen;
493 	u32 seq, snd_una;
494 	int err;
495 	struct net *net = dev_net(skb->dev);
496 
497 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
498 				       th->dest, iph->saddr, ntohs(th->source),
499 				       inet_iif(skb), 0);
500 	if (!sk) {
501 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
502 		return -ENOENT;
503 	}
504 	if (sk->sk_state == TCP_TIME_WAIT) {
505 		inet_twsk_put(inet_twsk(sk));
506 		return 0;
507 	}
508 	seq = ntohl(th->seq);
509 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
510 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
511 				     type == ICMP_TIME_EXCEEDED ||
512 				     (type == ICMP_DEST_UNREACH &&
513 				      (code == ICMP_NET_UNREACH ||
514 				       code == ICMP_HOST_UNREACH)));
515 		return 0;
516 	}
517 
518 	bh_lock_sock(sk);
519 	/* If too many ICMPs get dropped on busy
520 	 * servers this needs to be solved differently.
521 	 * We do take care of PMTU discovery (RFC1191) special case :
522 	 * we can receive locally generated ICMP messages while socket is held.
523 	 */
524 	if (sock_owned_by_user(sk)) {
525 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
526 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
527 	}
528 	if (sk->sk_state == TCP_CLOSE)
529 		goto out;
530 
531 	if (static_branch_unlikely(&ip4_min_ttl)) {
532 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
533 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
534 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
535 			goto out;
536 		}
537 	}
538 
539 	tp = tcp_sk(sk);
540 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
541 	fastopen = rcu_dereference(tp->fastopen_rsk);
542 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
543 	if (sk->sk_state != TCP_LISTEN &&
544 	    !between(seq, snd_una, tp->snd_nxt)) {
545 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
546 		goto out;
547 	}
548 
549 	switch (type) {
550 	case ICMP_REDIRECT:
551 		if (!sock_owned_by_user(sk))
552 			do_redirect(skb, sk);
553 		goto out;
554 	case ICMP_SOURCE_QUENCH:
555 		/* Just silently ignore these. */
556 		goto out;
557 	case ICMP_PARAMETERPROB:
558 		err = EPROTO;
559 		break;
560 	case ICMP_DEST_UNREACH:
561 		if (code > NR_ICMP_UNREACH)
562 			goto out;
563 
564 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
565 			/* We are not interested in TCP_LISTEN and open_requests
566 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
567 			 * they should go through unfragmented).
568 			 */
569 			if (sk->sk_state == TCP_LISTEN)
570 				goto out;
571 
572 			WRITE_ONCE(tp->mtu_info, info);
573 			if (!sock_owned_by_user(sk)) {
574 				tcp_v4_mtu_reduced(sk);
575 			} else {
576 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
577 					sock_hold(sk);
578 			}
579 			goto out;
580 		}
581 
582 		err = icmp_err_convert[code].errno;
583 		/* check if this ICMP message allows revert of backoff.
584 		 * (see RFC 6069)
585 		 */
586 		if (!fastopen &&
587 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
588 			tcp_ld_RTO_revert(sk, seq);
589 		break;
590 	case ICMP_TIME_EXCEEDED:
591 		err = EHOSTUNREACH;
592 		break;
593 	default:
594 		goto out;
595 	}
596 
597 	switch (sk->sk_state) {
598 	case TCP_SYN_SENT:
599 	case TCP_SYN_RECV:
600 		/* Only in fast or simultaneous open. If a fast open socket is
601 		 * already accepted it is treated as a connected one below.
602 		 */
603 		if (fastopen && !fastopen->sk)
604 			break;
605 
606 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
607 
608 		if (!sock_owned_by_user(sk)) {
609 			sk->sk_err = err;
610 
611 			sk_error_report(sk);
612 
613 			tcp_done(sk);
614 		} else {
615 			sk->sk_err_soft = err;
616 		}
617 		goto out;
618 	}
619 
620 	/* If we've already connected we will keep trying
621 	 * until we time out, or the user gives up.
622 	 *
623 	 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
624 	 * to be considered hard errors (well, FRAG_FAILED too,
625 	 * but it is obsoleted by pmtu discovery).
626 	 *
627 	 * Note that in the modern internet, where routing is unreliable
628 	 * and broken firewalls sit in every dark corner sending random
629 	 * errors ordered by their masters, even these two messages finally
630 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
631 	 *
632 	 * Now we are in compliance with RFCs.
633 	 *							--ANK (980905)
634 	 */
635 
636 	inet = inet_sk(sk);
637 	if (!sock_owned_by_user(sk) && inet->recverr) {
638 		sk->sk_err = err;
639 		sk_error_report(sk);
640 	} else	{ /* Only an error on timeout */
641 		sk->sk_err_soft = err;
642 	}
643 
644 out:
645 	bh_unlock_sock(sk);
646 	sock_put(sk);
647 	return 0;
648 }
649 
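/* Set up the TCP checksum for CHECKSUM_PARTIAL offload: seed th->check with
 * the pseudo-header checksum and record where the final checksum lives, so
 * the NIC (or skb_checksum_help()) can finish it over header and payload.
 */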
650 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
651 {
652 	struct tcphdr *th = tcp_hdr(skb);
653 
654 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
655 	skb->csum_start = skb_transport_header(skb) - skb->head;
656 	skb->csum_offset = offsetof(struct tcphdr, check);
657 }
658 
659 /* This routine computes an IPv4 TCP checksum. */
660 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
661 {
662 	const struct inet_sock *inet = inet_sk(sk);
663 
664 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
665 }
666 EXPORT_SYMBOL(tcp_v4_send_check);
667 
668 /*
669  *	This routine will send an RST to the other tcp.
670  *
671  *	Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
672  *		      for the reset?
673  *	Answer: if a packet caused an RST, it is not for a socket
674  *		existing in our system; if it is matched to a socket,
675  *		it is just a duplicate segment or a bug in the other side's TCP.
676  *		So we build the reply based only on parameters that
677  *		arrived with the segment.
678  *	Exception: precedence violation. We do not implement it in any case.
679  */
680 
681 #ifdef CONFIG_TCP_MD5SIG
682 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
683 #else
684 #define OPTION_BYTES sizeof(__be32)
685 #endif
686 
687 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
688 {
689 	const struct tcphdr *th = tcp_hdr(skb);
690 	struct {
691 		struct tcphdr th;
692 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
693 	} rep;
694 	struct ip_reply_arg arg;
695 #ifdef CONFIG_TCP_MD5SIG
696 	struct tcp_md5sig_key *key = NULL;
697 	const __u8 *hash_location = NULL;
698 	unsigned char newhash[16];
699 	int genhash;
700 	struct sock *sk1 = NULL;
701 #endif
702 	u64 transmit_time = 0;
703 	struct sock *ctl_sk;
704 	struct net *net;
705 
706 	/* Never send a reset in response to a reset. */
707 	if (th->rst)
708 		return;
709 
710 	/* If sk is not NULL, it means we did a successful lookup and the
711 	 * incoming route had to be correct. prequeue might have dropped our dst.
712 	 */
713 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
714 		return;
715 
716 	/* Swap the send and the receive. */
717 	memset(&rep, 0, sizeof(rep));
718 	rep.th.dest   = th->source;
719 	rep.th.source = th->dest;
720 	rep.th.doff   = sizeof(struct tcphdr) / 4;
721 	rep.th.rst    = 1;
722 
723 	if (th->ack) {
724 		rep.th.seq = th->ack_seq;
725 	} else {
726 		rep.th.ack = 1;
727 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
728 				       skb->len - (th->doff << 2));
729 	}
730 
731 	memset(&arg, 0, sizeof(arg));
732 	arg.iov[0].iov_base = (unsigned char *)&rep;
733 	arg.iov[0].iov_len  = sizeof(rep.th);
734 
735 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
736 #ifdef CONFIG_TCP_MD5SIG
737 	rcu_read_lock();
738 	hash_location = tcp_parse_md5sig_option(th);
739 	if (sk && sk_fullsock(sk)) {
740 		const union tcp_md5_addr *addr;
741 		int l3index;
742 
743 		/* sdif set, means packet ingressed via a device
744 		 * in an L3 domain and inet_iif is set to it.
745 		 */
746 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
747 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
749 	} else if (hash_location) {
750 		const union tcp_md5_addr *addr;
751 		int sdif = tcp_v4_sdif(skb);
752 		int dif = inet_iif(skb);
753 		int l3index;
754 
755 		/*
756 		 * The active side is lost. Try to find the listening socket via
757 		 * the source port, and then find the md5 key via that listening
758 		 * socket. We do not loosen security here:
759 		 * the incoming packet is checked against the md5 hash of the key
760 		 * we find; no RST is generated if the md5 hash doesn't match.
761 		 */
762 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
763 					     ip_hdr(skb)->saddr,
764 					     th->source, ip_hdr(skb)->daddr,
765 					     ntohs(th->source), dif, sdif);
766 		/* don't send rst if it can't find key */
767 		if (!sk1)
768 			goto out;
769 
770 		/* sdif set, means packet ingressed via a device
771 		 * in an L3 domain and dif is set to it.
772 		 */
773 		l3index = sdif ? dif : 0;
774 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
775 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
776 		if (!key)
777 			goto out;
778 
779 
780 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
781 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
782 			goto out;
783 
784 	}
785 
786 	if (key) {
787 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
788 				   (TCPOPT_NOP << 16) |
789 				   (TCPOPT_MD5SIG << 8) |
790 				   TCPOLEN_MD5SIG);
791 		/* Update length and the length the header thinks exists */
792 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
793 		rep.th.doff = arg.iov[0].iov_len / 4;
794 
795 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
796 				     key, ip_hdr(skb)->saddr,
797 				     ip_hdr(skb)->daddr, &rep.th);
798 	}
799 #endif
800 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
801 	if (rep.opt[0] == 0) {
802 		__be32 mrst = mptcp_reset_option(skb);
803 
804 		if (mrst) {
805 			rep.opt[0] = mrst;
806 			arg.iov[0].iov_len += sizeof(mrst);
807 			rep.th.doff = arg.iov[0].iov_len / 4;
808 		}
809 	}
810 
811 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
812 				      ip_hdr(skb)->saddr, /* XXX */
813 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
814 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
815 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
816 
817 	/* When the socket is gone, all binding information is lost.
818 	 * Routing might fail in this case. No choice here: if we choose to force
819 	 * the input interface, we will misroute in case of an asymmetric route.
820 	 */
821 	if (sk) {
822 		arg.bound_dev_if = sk->sk_bound_dev_if;
823 		if (sk_fullsock(sk))
824 			trace_tcp_send_reset(sk, skb);
825 	}
826 
827 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
828 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
829 
830 	arg.tos = ip_hdr(skb)->tos;
831 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
832 	local_bh_disable();
833 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
834 	sock_net_set(ctl_sk, net);
835 	if (sk) {
836 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
837 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
838 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
839 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
840 		transmit_time = tcp_transmit_time(sk);
841 		xfrm_sk_clone_policy(ctl_sk, sk);
842 	}
843 	ip_send_unicast_reply(ctl_sk,
844 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
845 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
846 			      &arg, arg.iov[0].iov_len,
847 			      transmit_time);
848 
849 	ctl_sk->sk_mark = 0;
850 	xfrm_sk_free_policy(ctl_sk);
851 	sock_net_set(ctl_sk, &init_net);
852 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
853 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
854 	local_bh_enable();
855 
856 #ifdef CONFIG_TCP_MD5SIG
857 out:
858 	rcu_read_unlock();
859 #endif
860 }
861 
862 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
863    outside socket context, is certainly ugly. What can I do?
864  */
865 
866 static void tcp_v4_send_ack(const struct sock *sk,
867 			    struct sk_buff *skb, u32 seq, u32 ack,
868 			    u32 win, u32 tsval, u32 tsecr, int oif,
869 			    struct tcp_md5sig_key *key,
870 			    int reply_flags, u8 tos)
871 {
872 	const struct tcphdr *th = tcp_hdr(skb);
873 	struct {
874 		struct tcphdr th;
875 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
876 #ifdef CONFIG_TCP_MD5SIG
877 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
878 #endif
879 			];
880 	} rep;
881 	struct net *net = sock_net(sk);
882 	struct ip_reply_arg arg;
883 	struct sock *ctl_sk;
884 	u64 transmit_time;
885 
886 	memset(&rep.th, 0, sizeof(struct tcphdr));
887 	memset(&arg, 0, sizeof(arg));
888 
889 	arg.iov[0].iov_base = (unsigned char *)&rep;
890 	arg.iov[0].iov_len  = sizeof(rep.th);
891 	if (tsecr) {
892 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
893 				   (TCPOPT_TIMESTAMP << 8) |
894 				   TCPOLEN_TIMESTAMP);
895 		rep.opt[1] = htonl(tsval);
896 		rep.opt[2] = htonl(tsecr);
897 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
898 	}
899 
900 	/* Swap the send and the receive. */
901 	rep.th.dest    = th->source;
902 	rep.th.source  = th->dest;
903 	rep.th.doff    = arg.iov[0].iov_len / 4;
904 	rep.th.seq     = htonl(seq);
905 	rep.th.ack_seq = htonl(ack);
906 	rep.th.ack     = 1;
907 	rep.th.window  = htons(win);
908 
909 #ifdef CONFIG_TCP_MD5SIG
910 	if (key) {
911 		int offset = (tsecr) ? 3 : 0;
912 
913 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
914 					  (TCPOPT_NOP << 16) |
915 					  (TCPOPT_MD5SIG << 8) |
916 					  TCPOLEN_MD5SIG);
917 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
918 		rep.th.doff = arg.iov[0].iov_len/4;
919 
920 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
921 				    key, ip_hdr(skb)->saddr,
922 				    ip_hdr(skb)->daddr, &rep.th);
923 	}
924 #endif
925 	arg.flags = reply_flags;
926 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
927 				      ip_hdr(skb)->saddr, /* XXX */
928 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
929 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
930 	if (oif)
931 		arg.bound_dev_if = oif;
932 	arg.tos = tos;
933 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
934 	local_bh_disable();
935 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
936 	sock_net_set(ctl_sk, net);
937 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
938 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
939 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
940 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
941 	transmit_time = tcp_transmit_time(sk);
942 	ip_send_unicast_reply(ctl_sk,
943 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
944 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
945 			      &arg, arg.iov[0].iov_len,
946 			      transmit_time);
947 
948 	ctl_sk->sk_mark = 0;
949 	sock_net_set(ctl_sk, &init_net);
950 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
951 	local_bh_enable();
952 }
953 
954 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
955 {
956 	struct inet_timewait_sock *tw = inet_twsk(sk);
957 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
958 
959 	tcp_v4_send_ack(sk, skb,
960 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
961 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
962 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
963 			tcptw->tw_ts_recent,
964 			tw->tw_bound_dev_if,
965 			tcp_twsk_md5_key(tcptw),
966 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
967 			tw->tw_tos
968 			);
969 
970 	inet_twsk_put(tw);
971 }
972 
973 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
974 				  struct request_sock *req)
975 {
976 	const union tcp_md5_addr *addr;
977 	int l3index;
978 
979 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
980 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
981 	 */
982 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
983 					     tcp_sk(sk)->snd_nxt;
984 
985 	/* RFC 7323 2.3
986 	 * The window field (SEG.WND) of every outgoing segment, with the
987 	 * exception of <SYN> segments, MUST be right-shifted by
988 	 * Rcv.Wind.Shift bits:
989 	 */
990 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
991 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
992 	tcp_v4_send_ack(sk, skb, seq,
993 			tcp_rsk(req)->rcv_nxt,
994 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
995 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
996 			req->ts_recent,
997 			0,
998 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
999 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1000 			ip_hdr(skb)->tos);
1001 }
1002 
1003 /*
1004  *	Send a SYN-ACK after having received a SYN.
1005  *	This still operates on a request_sock only, not on a big
1006  *	socket.
1007  */
1008 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1009 			      struct flowi *fl,
1010 			      struct request_sock *req,
1011 			      struct tcp_fastopen_cookie *foc,
1012 			      enum tcp_synack_type synack_type,
1013 			      struct sk_buff *syn_skb)
1014 {
1015 	const struct inet_request_sock *ireq = inet_rsk(req);
1016 	struct flowi4 fl4;
1017 	int err = -1;
1018 	struct sk_buff *skb;
1019 	u8 tos;
1020 
1021 	/* First, grab a route. */
1022 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1023 		return -1;
1024 
1025 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1026 
1027 	if (skb) {
1028 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1029 
1030 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1031 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1032 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1033 				inet_sk(sk)->tos;
1034 
1035 		if (!INET_ECN_is_capable(tos) &&
1036 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1037 			tos |= INET_ECN_ECT_0;
1038 
1039 		rcu_read_lock();
1040 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1041 					    ireq->ir_rmt_addr,
1042 					    rcu_dereference(ireq->ireq_opt),
1043 					    tos);
1044 		rcu_read_unlock();
1045 		err = net_xmit_eval(err);
1046 	}
1047 
1048 	return err;
1049 }
1050 
1051 /*
1052  *	IPv4 request_sock destructor.
1053  */
1054 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1055 {
1056 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1057 }
1058 
1059 #ifdef CONFIG_TCP_MD5SIG
1060 /*
1061  * RFC2385 MD5 checksumming requires a mapping of
1062  * IP address->MD5 Key.
1063  * We need to maintain these in the sk structure.
1064  */
1065 
1066 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1067 EXPORT_SYMBOL(tcp_md5_needed);
1068 
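/* Precedence between several matching keys: a key bound to an L3 domain
 * (l3index != 0, e.g. a VRF) always beats an unbound one; among keys of the
 * same kind, the longer (more specific) prefix wins, so a /32 key overrides
 * a /8 key covering the same peer.
 */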
1069 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1070 {
1071 	if (!old)
1072 		return true;
1073 
1074 	/* l3index always overrides non-l3index */
1075 	if (old->l3index && new->l3index == 0)
1076 		return false;
1077 	if (old->l3index == 0 && new->l3index)
1078 		return true;
1079 
1080 	return old->prefixlen < new->prefixlen;
1081 }
1082 
1083 /* Find the Key structure for an address.  */
1084 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1085 					   const union tcp_md5_addr *addr,
1086 					   int family)
1087 {
1088 	const struct tcp_sock *tp = tcp_sk(sk);
1089 	struct tcp_md5sig_key *key;
1090 	const struct tcp_md5sig_info *md5sig;
1091 	__be32 mask;
1092 	struct tcp_md5sig_key *best_match = NULL;
1093 	bool match;
1094 
1095 	/* caller either holds rcu_read_lock() or socket lock */
1096 	md5sig = rcu_dereference_check(tp->md5sig_info,
1097 				       lockdep_sock_is_held(sk));
1098 	if (!md5sig)
1099 		return NULL;
1100 
1101 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1102 				 lockdep_sock_is_held(sk)) {
1103 		if (key->family != family)
1104 			continue;
1105 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1106 			continue;
1107 		if (family == AF_INET) {
1108 			mask = inet_make_mask(key->prefixlen);
1109 			match = (key->addr.a4.s_addr & mask) ==
1110 				(addr->a4.s_addr & mask);
1111 #if IS_ENABLED(CONFIG_IPV6)
1112 		} else if (family == AF_INET6) {
1113 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1114 						  key->prefixlen);
1115 #endif
1116 		} else {
1117 			match = false;
1118 		}
1119 
1120 		if (match && better_md5_match(best_match, key))
1121 			best_match = key;
1122 	}
1123 	return best_match;
1124 }
1125 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1126 
1127 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1128 						      const union tcp_md5_addr *addr,
1129 						      int family, u8 prefixlen,
1130 						      int l3index, u8 flags)
1131 {
1132 	const struct tcp_sock *tp = tcp_sk(sk);
1133 	struct tcp_md5sig_key *key;
1134 	unsigned int size = sizeof(struct in_addr);
1135 	const struct tcp_md5sig_info *md5sig;
1136 
1137 	/* caller either holds rcu_read_lock() or socket lock */
1138 	md5sig = rcu_dereference_check(tp->md5sig_info,
1139 				       lockdep_sock_is_held(sk));
1140 	if (!md5sig)
1141 		return NULL;
1142 #if IS_ENABLED(CONFIG_IPV6)
1143 	if (family == AF_INET6)
1144 		size = sizeof(struct in6_addr);
1145 #endif
1146 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1147 				 lockdep_sock_is_held(sk)) {
1148 		if (key->family != family)
1149 			continue;
1150 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1151 			continue;
1152 		if (key->l3index != l3index)
1153 			continue;
1154 		if (!memcmp(&key->addr, addr, size) &&
1155 		    key->prefixlen == prefixlen)
1156 			return key;
1157 	}
1158 	return NULL;
1159 }
1160 
1161 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1162 					 const struct sock *addr_sk)
1163 {
1164 	const union tcp_md5_addr *addr;
1165 	int l3index;
1166 
1167 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1168 						 addr_sk->sk_bound_dev_if);
1169 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1170 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1171 }
1172 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1173 
1174 /* This can be called on a newly created socket, from other files */
1175 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1176 		   int family, u8 prefixlen, int l3index, u8 flags,
1177 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1178 {
1179 	/* Add Key to the list */
1180 	struct tcp_md5sig_key *key;
1181 	struct tcp_sock *tp = tcp_sk(sk);
1182 	struct tcp_md5sig_info *md5sig;
1183 
1184 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1185 	if (key) {
1186 		/* Pre-existing entry - just update that one.
1187 		 * Note that the key might be used concurrently.
1188 		 * data_race() is telling kcsan that we do not care about
1189 		 * key mismatches, since changing the MD5 key on live flows
1190 		 * can lead to packet drops.
1191 		 */
1192 		data_race(memcpy(key->key, newkey, newkeylen));
1193 
1194 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1195 		 * Also note that a reader could catch the new key->keylen value
1196 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1197 		 * at sock_kmalloc() time below these lines.
1198 		 */
1199 		WRITE_ONCE(key->keylen, newkeylen);
1200 
1201 		return 0;
1202 	}
1203 
1204 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1205 					   lockdep_sock_is_held(sk));
1206 	if (!md5sig) {
1207 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1208 		if (!md5sig)
1209 			return -ENOMEM;
1210 
1211 		sk_gso_disable(sk);
1212 		INIT_HLIST_HEAD(&md5sig->head);
1213 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1214 	}
1215 
1216 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1217 	if (!key)
1218 		return -ENOMEM;
1219 	if (!tcp_alloc_md5sig_pool()) {
1220 		sock_kfree_s(sk, key, sizeof(*key));
1221 		return -ENOMEM;
1222 	}
1223 
1224 	memcpy(key->key, newkey, newkeylen);
1225 	key->keylen = newkeylen;
1226 	key->family = family;
1227 	key->prefixlen = prefixlen;
1228 	key->l3index = l3index;
1229 	key->flags = flags;
1230 	memcpy(&key->addr, addr,
1231 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1232 								 sizeof(struct in_addr));
1233 	hlist_add_head_rcu(&key->node, &md5sig->head);
1234 	return 0;
1235 }
1236 EXPORT_SYMBOL(tcp_md5_do_add);
1237 
1238 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1239 		   u8 prefixlen, int l3index, u8 flags)
1240 {
1241 	struct tcp_md5sig_key *key;
1242 
1243 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1244 	if (!key)
1245 		return -ENOENT;
1246 	hlist_del_rcu(&key->node);
1247 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1248 	kfree_rcu(key, rcu);
1249 	return 0;
1250 }
1251 EXPORT_SYMBOL(tcp_md5_do_del);
1252 
1253 static void tcp_clear_md5_list(struct sock *sk)
1254 {
1255 	struct tcp_sock *tp = tcp_sk(sk);
1256 	struct tcp_md5sig_key *key;
1257 	struct hlist_node *n;
1258 	struct tcp_md5sig_info *md5sig;
1259 
1260 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1261 
1262 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1263 		hlist_del_rcu(&key->node);
1264 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1265 		kfree_rcu(key, rcu);
1266 	}
1267 }
1268 
1269 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1270 				 sockptr_t optval, int optlen)
1271 {
1272 	struct tcp_md5sig cmd;
1273 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1274 	const union tcp_md5_addr *addr;
1275 	u8 prefixlen = 32;
1276 	int l3index = 0;
1277 	u8 flags;
1278 
1279 	if (optlen < sizeof(cmd))
1280 		return -EINVAL;
1281 
1282 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1283 		return -EFAULT;
1284 
1285 	if (sin->sin_family != AF_INET)
1286 		return -EINVAL;
1287 
1288 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1289 
1290 	if (optname == TCP_MD5SIG_EXT &&
1291 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1292 		prefixlen = cmd.tcpm_prefixlen;
1293 		if (prefixlen > 32)
1294 			return -EINVAL;
1295 	}
1296 
1297 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1298 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1299 		struct net_device *dev;
1300 
1301 		rcu_read_lock();
1302 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1303 		if (dev && netif_is_l3_master(dev))
1304 			l3index = dev->ifindex;
1305 
1306 		rcu_read_unlock();
1307 
1308 		/* ok to reference set/not set outside of rcu;
1309 		 * right now device MUST be an L3 master
1310 		 */
1311 		if (!dev || !l3index)
1312 			return -EINVAL;
1313 	}
1314 
1315 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1316 
1317 	if (!cmd.tcpm_keylen)
1318 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1319 
1320 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1321 		return -EINVAL;
1322 
1323 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1324 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1325 }
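/* A minimal userspace sketch of the interface parsed above (uapi
 * struct tcp_md5sig; error handling omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * TCP_MD5SIG_EXT additionally honours tcpm_prefixlen (with
 * TCP_MD5SIG_FLAG_PREFIX) and tcpm_ifindex (with TCP_MD5SIG_FLAG_IFINDEX);
 * a zero tcpm_keylen deletes the matching key.
 */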
1326 
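/* RFC 2385: the MD5 digest covers, in order, the IPv4 pseudo-header
 * (saddr, daddr, zero pad, protocol, TCP length), the TCP header excluding
 * options and with its checksum zeroed, the segment payload, and finally
 * the key itself. The helpers below feed the ahash request in that order.
 */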
1327 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1328 				   __be32 daddr, __be32 saddr,
1329 				   const struct tcphdr *th, int nbytes)
1330 {
1331 	struct tcp4_pseudohdr *bp;
1332 	struct scatterlist sg;
1333 	struct tcphdr *_th;
1334 
1335 	bp = hp->scratch;
1336 	bp->saddr = saddr;
1337 	bp->daddr = daddr;
1338 	bp->pad = 0;
1339 	bp->protocol = IPPROTO_TCP;
1340 	bp->len = cpu_to_be16(nbytes);
1341 
1342 	_th = (struct tcphdr *)(bp + 1);
1343 	memcpy(_th, th, sizeof(*th));
1344 	_th->check = 0;
1345 
1346 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1347 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1348 				sizeof(*bp) + sizeof(*th));
1349 	return crypto_ahash_update(hp->md5_req);
1350 }
1351 
1352 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1353 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1354 {
1355 	struct tcp_md5sig_pool *hp;
1356 	struct ahash_request *req;
1357 
1358 	hp = tcp_get_md5sig_pool();
1359 	if (!hp)
1360 		goto clear_hash_noput;
1361 	req = hp->md5_req;
1362 
1363 	if (crypto_ahash_init(req))
1364 		goto clear_hash;
1365 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1366 		goto clear_hash;
1367 	if (tcp_md5_hash_key(hp, key))
1368 		goto clear_hash;
1369 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1370 	if (crypto_ahash_final(req))
1371 		goto clear_hash;
1372 
1373 	tcp_put_md5sig_pool();
1374 	return 0;
1375 
1376 clear_hash:
1377 	tcp_put_md5sig_pool();
1378 clear_hash_noput:
1379 	memset(md5_hash, 0, 16);
1380 	return 1;
1381 }
1382 
1383 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1384 			const struct sock *sk,
1385 			const struct sk_buff *skb)
1386 {
1387 	struct tcp_md5sig_pool *hp;
1388 	struct ahash_request *req;
1389 	const struct tcphdr *th = tcp_hdr(skb);
1390 	__be32 saddr, daddr;
1391 
1392 	if (sk) { /* valid for establish/request sockets */
1393 		saddr = sk->sk_rcv_saddr;
1394 		daddr = sk->sk_daddr;
1395 	} else {
1396 		const struct iphdr *iph = ip_hdr(skb);
1397 		saddr = iph->saddr;
1398 		daddr = iph->daddr;
1399 	}
1400 
1401 	hp = tcp_get_md5sig_pool();
1402 	if (!hp)
1403 		goto clear_hash_noput;
1404 	req = hp->md5_req;
1405 
1406 	if (crypto_ahash_init(req))
1407 		goto clear_hash;
1408 
1409 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1410 		goto clear_hash;
1411 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1412 		goto clear_hash;
1413 	if (tcp_md5_hash_key(hp, key))
1414 		goto clear_hash;
1415 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1416 	if (crypto_ahash_final(req))
1417 		goto clear_hash;
1418 
1419 	tcp_put_md5sig_pool();
1420 	return 0;
1421 
1422 clear_hash:
1423 	tcp_put_md5sig_pool();
1424 clear_hash_noput:
1425 	memset(md5_hash, 0, 16);
1426 	return 1;
1427 }
1428 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1429 
1430 #endif
1431 
1432 static void tcp_v4_init_req(struct request_sock *req,
1433 			    const struct sock *sk_listener,
1434 			    struct sk_buff *skb)
1435 {
1436 	struct inet_request_sock *ireq = inet_rsk(req);
1437 	struct net *net = sock_net(sk_listener);
1438 
1439 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1440 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1441 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1442 }
1443 
1444 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1445 					  struct sk_buff *skb,
1446 					  struct flowi *fl,
1447 					  struct request_sock *req)
1448 {
1449 	tcp_v4_init_req(req, sk, skb);
1450 
1451 	if (security_inet_conn_request(sk, skb, req))
1452 		return NULL;
1453 
1454 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1455 }
1456 
1457 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1458 	.family		=	PF_INET,
1459 	.obj_size	=	sizeof(struct tcp_request_sock),
1460 	.rtx_syn_ack	=	tcp_rtx_synack,
1461 	.send_ack	=	tcp_v4_reqsk_send_ack,
1462 	.destructor	=	tcp_v4_reqsk_destructor,
1463 	.send_reset	=	tcp_v4_send_reset,
1464 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1465 };
1466 
1467 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1468 	.mss_clamp	=	TCP_MSS_DEFAULT,
1469 #ifdef CONFIG_TCP_MD5SIG
1470 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1471 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1472 #endif
1473 #ifdef CONFIG_SYN_COOKIES
1474 	.cookie_init_seq =	cookie_v4_init_sequence,
1475 #endif
1476 	.route_req	=	tcp_v4_route_req,
1477 	.init_seq	=	tcp_v4_init_seq,
1478 	.init_ts_off	=	tcp_v4_init_ts_off,
1479 	.send_synack	=	tcp_v4_send_synack,
1480 };
1481 
1482 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1483 {
1484 	/* Never answer SYNs sent to broadcast or multicast */
1485 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1486 		goto drop;
1487 
1488 	return tcp_conn_request(&tcp_request_sock_ops,
1489 				&tcp_request_sock_ipv4_ops, sk, skb);
1490 
1491 drop:
1492 	tcp_listendrop(sk);
1493 	return 0;
1494 }
1495 EXPORT_SYMBOL(tcp_v4_conn_request);
1496 
1497 
1498 /*
1499  * The three way handshake has completed - we got a valid synack -
1500  * now create the new socket.
1501  */
1502 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1503 				  struct request_sock *req,
1504 				  struct dst_entry *dst,
1505 				  struct request_sock *req_unhash,
1506 				  bool *own_req)
1507 {
1508 	struct inet_request_sock *ireq;
1509 	bool found_dup_sk = false;
1510 	struct inet_sock *newinet;
1511 	struct tcp_sock *newtp;
1512 	struct sock *newsk;
1513 #ifdef CONFIG_TCP_MD5SIG
1514 	const union tcp_md5_addr *addr;
1515 	struct tcp_md5sig_key *key;
1516 	int l3index;
1517 #endif
1518 	struct ip_options_rcu *inet_opt;
1519 
1520 	if (sk_acceptq_is_full(sk))
1521 		goto exit_overflow;
1522 
1523 	newsk = tcp_create_openreq_child(sk, req, skb);
1524 	if (!newsk)
1525 		goto exit_nonewsk;
1526 
1527 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1528 	inet_sk_rx_dst_set(newsk, skb);
1529 
1530 	newtp		      = tcp_sk(newsk);
1531 	newinet		      = inet_sk(newsk);
1532 	ireq		      = inet_rsk(req);
1533 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1534 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1535 	newsk->sk_bound_dev_if = ireq->ir_iif;
1536 	newinet->inet_saddr   = ireq->ir_loc_addr;
1537 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1538 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1539 	newinet->mc_index     = inet_iif(skb);
1540 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1541 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1542 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1543 	if (inet_opt)
1544 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1545 	newinet->inet_id = prandom_u32();
1546 
1547 	/* Set ToS of the new socket based upon the value of incoming SYN.
1548 	 * ECT bits are set later in tcp_init_transfer().
1549 	 */
1550 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1551 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1552 
1553 	if (!dst) {
1554 		dst = inet_csk_route_child_sock(sk, newsk, req);
1555 		if (!dst)
1556 			goto put_and_exit;
1557 	} else {
1558 		/* syncookie case : see end of cookie_v4_check() */
1559 	}
1560 	sk_setup_caps(newsk, dst);
1561 
1562 	tcp_ca_openreq_child(newsk, dst);
1563 
1564 	tcp_sync_mss(newsk, dst_mtu(dst));
1565 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1566 
1567 	tcp_initialize_rcv_mss(newsk);
1568 
1569 #ifdef CONFIG_TCP_MD5SIG
1570 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1571 	/* Copy over the MD5 key from the original socket */
1572 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1573 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1574 	if (key) {
1575 		/*
1576 		 * We're using one, so create a matching key
1577 		 * on the newsk structure. If we fail to get
1578 		 * memory, then we end up not copying the key
1579 		 * across. Shucks.
1580 		 */
1581 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1582 			       key->key, key->keylen, GFP_ATOMIC);
1583 		sk_gso_disable(newsk);
1584 	}
1585 #endif
1586 
1587 	if (__inet_inherit_port(sk, newsk) < 0)
1588 		goto put_and_exit;
1589 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1590 				       &found_dup_sk);
1591 	if (likely(*own_req)) {
1592 		tcp_move_syn(newtp, req);
1593 		ireq->ireq_opt = NULL;
1594 	} else {
1595 		newinet->inet_opt = NULL;
1596 
1597 		if (!req_unhash && found_dup_sk) {
1598 			/* This code path should only be executed in the
1599 			 * syncookie case
1600 			 */
1601 			bh_unlock_sock(newsk);
1602 			sock_put(newsk);
1603 			newsk = NULL;
1604 		}
1605 	}
1606 	return newsk;
1607 
1608 exit_overflow:
1609 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1610 exit_nonewsk:
1611 	dst_release(dst);
1612 exit:
1613 	tcp_listendrop(sk);
1614 	return NULL;
1615 put_and_exit:
1616 	newinet->inet_opt = NULL;
1617 	inet_csk_prepare_forced_close(newsk);
1618 	tcp_done(newsk);
1619 	goto exit;
1620 }
1621 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1622 
1623 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1624 {
1625 #ifdef CONFIG_SYN_COOKIES
1626 	const struct tcphdr *th = tcp_hdr(skb);
1627 
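	/* A SYN goes through the normal listener path; only a bare ACK can
	 * carry a SYN cookie, so validate it (and possibly create the child
	 * socket) here.
	 */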
1628 	if (!th->syn)
1629 		sk = cookie_v4_check(sk, skb);
1630 #endif
1631 	return sk;
1632 }
1633 
1634 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1635 			 struct tcphdr *th, u32 *cookie)
1636 {
1637 	u16 mss = 0;
1638 #ifdef CONFIG_SYN_COOKIES
1639 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1640 				    &tcp_request_sock_ipv4_ops, sk, th);
1641 	if (mss) {
1642 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1643 		tcp_synq_overflow(sk);
1644 	}
1645 #endif
1646 	return mss;
1647 }
1648 
1649 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1650 							   u32));
1651 /* The socket must have its spinlock held when we get
1652  * here, unless it is a TCP_LISTEN socket.
1653  *
1654  * We have a potential double-lock case here, so even when
1655  * doing backlog processing we use the BH locking scheme.
1656  * This is because we cannot sleep with the original spinlock
1657  * held.
1658  */
1659 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1660 {
1661 	enum skb_drop_reason reason;
1662 	struct sock *rsk;
1663 
1664 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1665 		struct dst_entry *dst;
1666 
1667 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1668 						lockdep_sock_is_held(sk));
1669 
1670 		sock_rps_save_rxhash(sk, skb);
1671 		sk_mark_napi_id(sk, skb);
1672 		if (dst) {
1673 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1674 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1675 					     dst, 0)) {
1676 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1677 				dst_release(dst);
1678 			}
1679 		}
1680 		tcp_rcv_established(sk, skb);
1681 		return 0;
1682 	}
1683 
1684 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1685 	if (tcp_checksum_complete(skb))
1686 		goto csum_err;
1687 
1688 	if (sk->sk_state == TCP_LISTEN) {
1689 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1690 
1691 		if (!nsk)
1692 			goto discard;
1693 		if (nsk != sk) {
1694 			if (tcp_child_process(sk, nsk, skb)) {
1695 				rsk = nsk;
1696 				goto reset;
1697 			}
1698 			return 0;
1699 		}
1700 	} else
1701 		sock_rps_save_rxhash(sk, skb);
1702 
1703 	if (tcp_rcv_state_process(sk, skb)) {
1704 		rsk = sk;
1705 		goto reset;
1706 	}
1707 	return 0;
1708 
1709 reset:
1710 	tcp_v4_send_reset(rsk, skb);
1711 discard:
1712 	kfree_skb_reason(skb, reason);
1713 	/* Be careful here. If this function gets more complicated and
1714 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1715 	 * might be destroyed here. This current version compiles correctly,
1716 	 * but you have been warned.
1717 	 */
1718 	return 0;
1719 
1720 csum_err:
1721 	reason = SKB_DROP_REASON_TCP_CSUM;
1722 	trace_tcp_bad_csum(skb);
1723 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1724 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1725 	goto discard;
1726 }
1727 EXPORT_SYMBOL(tcp_v4_do_rcv);
1728 
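/* Early demux: at IP receive time, look up an established socket by its
 * 4-tuple and attach it (and, if still valid, its cached input route) to
 * the skb, letting tcp_v4_rcv() skip the full socket and route lookups.
 */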
1729 int tcp_v4_early_demux(struct sk_buff *skb)
1730 {
1731 	const struct iphdr *iph;
1732 	const struct tcphdr *th;
1733 	struct sock *sk;
1734 
1735 	if (skb->pkt_type != PACKET_HOST)
1736 		return 0;
1737 
1738 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1739 		return 0;
1740 
1741 	iph = ip_hdr(skb);
1742 	th = tcp_hdr(skb);
1743 
1744 	if (th->doff < sizeof(struct tcphdr) / 4)
1745 		return 0;
1746 
1747 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1748 				       iph->saddr, th->source,
1749 				       iph->daddr, ntohs(th->dest),
1750 				       skb->skb_iif, inet_sdif(skb));
1751 	if (sk) {
1752 		skb->sk = sk;
1753 		skb->destructor = sock_edemux;
1754 		if (sk_fullsock(sk)) {
1755 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1756 
1757 			if (dst)
1758 				dst = dst_check(dst, 0);
1759 			if (dst &&
1760 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1761 				skb_dst_set_noref(skb, dst);
1762 		}
1763 	}
1764 	return 0;
1765 }
1766 
1767 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1768 		     enum skb_drop_reason *reason)
1769 {
1770 	u32 limit, tail_gso_size, tail_gso_segs;
1771 	struct skb_shared_info *shinfo;
1772 	const struct tcphdr *th;
1773 	struct tcphdr *thtail;
1774 	struct sk_buff *tail;
1775 	unsigned int hdrlen;
1776 	bool fragstolen;
1777 	u32 gso_segs;
1778 	u32 gso_size;
1779 	int delta;
1780 
1781 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1782 	 * we can fix skb->truesize to its real value to avoid future drops.
1783 	 * This is valid because skb is not yet charged to the socket.
1784 	 * It has been noticed that pure SACK packets were sometimes dropped
1785 	 * (if cooked by drivers without the copybreak feature).
1786 	 */
1787 	skb_condense(skb);
1788 
1789 	skb_dst_drop(skb);
1790 
1791 	if (unlikely(tcp_checksum_complete(skb))) {
1792 		bh_unlock_sock(sk);
1793 		trace_tcp_bad_csum(skb);
1794 		*reason = SKB_DROP_REASON_TCP_CSUM;
1795 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1796 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1797 		return true;
1798 	}
1799 
1800 	/* Attempt coalescing to last skb in backlog, even if we are
1801 	 * above the limits.
1802 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1803 	 */
1804 	th = (const struct tcphdr *)skb->data;
1805 	hdrlen = th->doff * 4;
1806 
1807 	tail = sk->sk_backlog.tail;
1808 	if (!tail)
1809 		goto no_coalesce;
1810 	thtail = (struct tcphdr *)tail->data;
1811 
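	/* Coalescing with the backlog tail requires (per the checks below):
	 * contiguous sequence space, identical DSCP, no SYN/RST/URG on
	 * either skb, ACK set on both, matching ECE/CWR bits, the same TLS
	 * decryption state and byte-identical TCP options.
	 */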
1812 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1813 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1814 	    ((TCP_SKB_CB(tail)->tcp_flags |
1815 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1816 	    !((TCP_SKB_CB(tail)->tcp_flags &
1817 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1818 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1819 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1820 #ifdef CONFIG_TLS_DEVICE
1821 	    tail->decrypted != skb->decrypted ||
1822 #endif
1823 	    thtail->doff != th->doff ||
1824 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1825 		goto no_coalesce;
1826 
1827 	__skb_pull(skb, hdrlen);
1828 
1829 	shinfo = skb_shinfo(skb);
1830 	gso_size = shinfo->gso_size ?: skb->len;
1831 	gso_segs = shinfo->gso_segs ?: 1;
1832 
1833 	shinfo = skb_shinfo(tail);
1834 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1835 	tail_gso_segs = shinfo->gso_segs ?: 1;
1836 
1837 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1838 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1839 
1840 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1841 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1842 			thtail->window = th->window;
1843 		}
1844 
1845 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1846 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1847 		 * is not entered if we append a packet with a FIN.
1848 		 * SYN, RST, URG are not present.
1849 		 * ACK is set on both packets.
1850 		 * PSH : the TCP stack does not really care,
1851 		 *       at least for 'GRO' packets.
1852 		 */
1853 		thtail->fin |= th->fin;
1854 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1855 
1856 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1857 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1858 			tail->tstamp = skb->tstamp;
1859 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1860 		}
1861 
1862 		/* Not as strict as GRO. We only need to carry the max mss value */
1863 		shinfo->gso_size = max(gso_size, tail_gso_size);
1864 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1865 
1866 		sk->sk_backlog.len += delta;
1867 		__NET_INC_STATS(sock_net(sk),
1868 				LINUX_MIB_TCPBACKLOGCOALESCE);
1869 		kfree_skb_partial(skb, fragstolen);
1870 		return false;
1871 	}
1872 	__skb_push(skb, hdrlen);
1873 
1874 no_coalesce:
1875 	/* Only the socket owner can try to collapse/prune rx queues
1876 	 * to reduce memory overhead, so add a little headroom here.
1877 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1878 	 */
1879 	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
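	/* Illustrative arithmetic: sk_rcvbuf = 131072 and sk_sndbuf = 16384
	 * would give limit = 131072 + 16384 + 65536 = 212992 bytes of backlog.
	 */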
1880 
1881 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1882 		bh_unlock_sock(sk);
1883 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1884 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1885 		return true;
1886 	}
1887 	return false;
1888 }
1889 EXPORT_SYMBOL(tcp_add_backlog);
1890 
1891 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1892 {
1893 	struct tcphdr *th = (struct tcphdr *)skb->data;
1894 
1895 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1896 }
1897 EXPORT_SYMBOL(tcp_filter);
1898 
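/* tcp_v4_fill_cb() below relocates the IP control block (IPCB) into
 * TCP_SKB_CB(skb)->header.h4 so the rest of skb->cb[] can carry TCP state;
 * tcp_v4_restore_cb() moves it back whenever the skb has to be handed to
 * code that expects IPCB at its usual location (e.g. before redoing a
 * socket lookup).
 */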
1899 static void tcp_v4_restore_cb(struct sk_buff *skb)
1900 {
1901 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1902 		sizeof(struct inet_skb_parm));
1903 }
1904 
1905 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1906 			   const struct tcphdr *th)
1907 {
1908 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1909 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1910 	 */
1911 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1912 		sizeof(struct inet_skb_parm));
1913 	barrier();
1914 
1915 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1916 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1917 				    skb->len - th->doff * 4);
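	/* end_seq covers the payload plus one sequence number each for SYN
	 * and FIN.  Illustrative: seq = 1000, 100 payload bytes, FIN set
	 * => end_seq = 1101.
	 */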
1918 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1919 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1920 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1921 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1922 	TCP_SKB_CB(skb)->sacked	 = 0;
1923 	TCP_SKB_CB(skb)->has_rxtstamp =
1924 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1925 }
1926 
1927 /*
1928  *	From tcp_input.c
1929  */
1930 
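/* Main IPv4 receive routine (roughly): validate the header and checksum,
 * look up the owning socket, handle TIME_WAIT and NEW_SYN_RECV specially,
 * run xfrm policy, MD5 and socket-filter checks, then either process the
 * segment under the socket lock or push it onto the backlog when the
 * socket is owned by user context.
 */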
1931 int tcp_v4_rcv(struct sk_buff *skb)
1932 {
1933 	struct net *net = dev_net(skb->dev);
1934 	enum skb_drop_reason drop_reason;
1935 	int sdif = inet_sdif(skb);
1936 	int dif = inet_iif(skb);
1937 	const struct iphdr *iph;
1938 	const struct tcphdr *th;
1939 	bool refcounted;
1940 	struct sock *sk;
1941 	int ret;
1942 
1943 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1944 	if (skb->pkt_type != PACKET_HOST)
1945 		goto discard_it;
1946 
1947 	/* Count it even if it's bad */
1948 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1949 
1950 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1951 		goto discard_it;
1952 
1953 	th = (const struct tcphdr *)skb->data;
1954 
1955 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1956 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1957 		goto bad_packet;
1958 	}
1959 	if (!pskb_may_pull(skb, th->doff * 4))
1960 		goto discard_it;
1961 
1962 	/* An explanation is required here, I think.
1963 	 * Packet length and doff are validated by header prediction,
1964 	 * provided the case of th->doff == 0 is eliminated.
1965 	 * So, we defer the checks. */
1966 
1967 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1968 		goto csum_error;
1969 
1970 	th = (const struct tcphdr *)skb->data;
1971 	iph = ip_hdr(skb);
1972 lookup:
1973 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1974 			       th->dest, sdif, &refcounted);
1975 	if (!sk)
1976 		goto no_tcp_socket;
1977 
1978 process:
1979 	if (sk->sk_state == TCP_TIME_WAIT)
1980 		goto do_time_wait;
1981 
1982 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1983 		struct request_sock *req = inet_reqsk(sk);
1984 		bool req_stolen = false;
1985 		struct sock *nsk;
1986 
1987 		sk = req->rsk_listener;
1988 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1989 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1990 		else
1991 			drop_reason = tcp_inbound_md5_hash(sk, skb,
1992 						   &iph->saddr, &iph->daddr,
1993 						   AF_INET, dif, sdif);
1994 		if (unlikely(drop_reason)) {
1995 			sk_drops_add(sk, skb);
1996 			reqsk_put(req);
1997 			goto discard_it;
1998 		}
1999 		if (tcp_checksum_complete(skb)) {
2000 			reqsk_put(req);
2001 			goto csum_error;
2002 		}
2003 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2004 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2005 			if (!nsk) {
2006 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2007 				goto lookup;
2008 			}
2009 			sk = nsk;
2010 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2011 			 * before returning.
2012 			 */
2013 		} else {
2014 			/* We own a reference on the listener; increase it again
2015 			 * as we might lose it too soon.
2016 			 */
2017 			sock_hold(sk);
2018 		}
2019 		refcounted = true;
2020 		nsk = NULL;
2021 		if (!tcp_filter(sk, skb)) {
2022 			th = (const struct tcphdr *)skb->data;
2023 			iph = ip_hdr(skb);
2024 			tcp_v4_fill_cb(skb, iph, th);
2025 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2026 		} else {
2027 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2028 		}
2029 		if (!nsk) {
2030 			reqsk_put(req);
2031 			if (req_stolen) {
2032 				/* Another cpu got exclusive access to req
2033 				 * and created a full-blown socket.
2034 				 * Try to feed this packet to this socket
2035 				 * instead of discarding it.
2036 				 */
2037 				tcp_v4_restore_cb(skb);
2038 				sock_put(sk);
2039 				goto lookup;
2040 			}
2041 			goto discard_and_relse;
2042 		}
2043 		nf_reset_ct(skb);
2044 		if (nsk == sk) {
2045 			reqsk_put(req);
2046 			tcp_v4_restore_cb(skb);
2047 		} else if (tcp_child_process(sk, nsk, skb)) {
2048 			tcp_v4_send_reset(nsk, skb);
2049 			goto discard_and_relse;
2050 		} else {
2051 			sock_put(sk);
2052 			return 0;
2053 		}
2054 	}
2055 
2056 	if (static_branch_unlikely(&ip4_min_ttl)) {
2057 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2058 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2059 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2060 			goto discard_and_relse;
2061 		}
2062 	}
2063 
2064 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2065 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2066 		goto discard_and_relse;
2067 	}
2068 
2069 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2070 					   &iph->daddr, AF_INET, dif, sdif);
2071 	if (drop_reason)
2072 		goto discard_and_relse;
2073 
2074 	nf_reset_ct(skb);
2075 
2076 	if (tcp_filter(sk, skb)) {
2077 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2078 		goto discard_and_relse;
2079 	}
2080 	th = (const struct tcphdr *)skb->data;
2081 	iph = ip_hdr(skb);
2082 	tcp_v4_fill_cb(skb, iph, th);
2083 
2084 	skb->dev = NULL;
2085 
2086 	if (sk->sk_state == TCP_LISTEN) {
2087 		ret = tcp_v4_do_rcv(sk, skb);
2088 		goto put_and_return;
2089 	}
2090 
2091 	sk_incoming_cpu_update(sk);
2092 
2093 	bh_lock_sock_nested(sk);
2094 	tcp_segs_in(tcp_sk(sk), skb);
2095 	ret = 0;
2096 	if (!sock_owned_by_user(sk)) {
2097 		ret = tcp_v4_do_rcv(sk, skb);
2098 	} else {
2099 		if (tcp_add_backlog(sk, skb, &drop_reason))
2100 			goto discard_and_relse;
2101 	}
2102 	bh_unlock_sock(sk);
2103 
2104 put_and_return:
2105 	if (refcounted)
2106 		sock_put(sk);
2107 
2108 	return ret;
2109 
2110 no_tcp_socket:
2111 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2112 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2113 		goto discard_it;
2114 
2115 	tcp_v4_fill_cb(skb, iph, th);
2116 
2117 	if (tcp_checksum_complete(skb)) {
2118 csum_error:
2119 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2120 		trace_tcp_bad_csum(skb);
2121 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2122 bad_packet:
2123 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2124 	} else {
2125 		tcp_v4_send_reset(NULL, skb);
2126 	}
2127 
2128 discard_it:
2129 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2130 	/* Discard frame. */
2131 	kfree_skb_reason(skb, drop_reason);
2132 	return 0;
2133 
2134 discard_and_relse:
2135 	sk_drops_add(sk, skb);
2136 	if (refcounted)
2137 		sock_put(sk);
2138 	goto discard_it;
2139 
2140 do_time_wait:
2141 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2142 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2143 		inet_twsk_put(inet_twsk(sk));
2144 		goto discard_it;
2145 	}
2146 
2147 	tcp_v4_fill_cb(skb, iph, th);
2148 
2149 	if (tcp_checksum_complete(skb)) {
2150 		inet_twsk_put(inet_twsk(sk));
2151 		goto csum_error;
2152 	}
2153 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2154 	case TCP_TW_SYN: {
2155 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2156 							&tcp_hashinfo, skb,
2157 							__tcp_hdrlen(th),
2158 							iph->saddr, th->source,
2159 							iph->daddr, th->dest,
2160 							inet_iif(skb),
2161 							sdif);
2162 		if (sk2) {
2163 			inet_twsk_deschedule_put(inet_twsk(sk));
2164 			sk = sk2;
2165 			tcp_v4_restore_cb(skb);
2166 			refcounted = false;
2167 			goto process;
2168 		}
2169 	}
2170 		/* to ACK */
2171 		fallthrough;
2172 	case TCP_TW_ACK:
2173 		tcp_v4_timewait_ack(sk, skb);
2174 		break;
2175 	case TCP_TW_RST:
2176 		tcp_v4_send_reset(sk, skb);
2177 		inet_twsk_deschedule_put(inet_twsk(sk));
2178 		goto discard_it;
2179 	case TCP_TW_SUCCESS:;
2180 	}
2181 	goto discard_it;
2182 }
2183 
2184 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2185 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2186 	.twsk_unique	= tcp_twsk_unique,
2187 	.twsk_destructor= tcp_twsk_destructor,
2188 };
2189 
2190 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2191 {
2192 	struct dst_entry *dst = skb_dst(skb);
2193 
2194 	if (dst && dst_hold_safe(dst)) {
2195 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2196 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2197 	}
2198 }
2199 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2200 
2201 const struct inet_connection_sock_af_ops ipv4_specific = {
2202 	.queue_xmit	   = ip_queue_xmit,
2203 	.send_check	   = tcp_v4_send_check,
2204 	.rebuild_header	   = inet_sk_rebuild_header,
2205 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2206 	.conn_request	   = tcp_v4_conn_request,
2207 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2208 	.net_header_len	   = sizeof(struct iphdr),
2209 	.setsockopt	   = ip_setsockopt,
2210 	.getsockopt	   = ip_getsockopt,
2211 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2212 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2213 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2214 };
2215 EXPORT_SYMBOL(ipv4_specific);
2216 
2217 #ifdef CONFIG_TCP_MD5SIG
2218 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2219 	.md5_lookup		= tcp_v4_md5_lookup,
2220 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2221 	.md5_parse		= tcp_v4_parse_md5_keys,
2222 };
2223 #endif
2224 
2225 /* NOTE: A lot of things are set to zero explicitly by the call to
2226  *       sk_alloc(), so they need not be done here.
2227  */
2228 static int tcp_v4_init_sock(struct sock *sk)
2229 {
2230 	struct inet_connection_sock *icsk = inet_csk(sk);
2231 
2232 	tcp_init_sock(sk);
2233 
2234 	icsk->icsk_af_ops = &ipv4_specific;
2235 
2236 #ifdef CONFIG_TCP_MD5SIG
2237 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2238 #endif
2239 
2240 	return 0;
2241 }
2242 
2243 void tcp_v4_destroy_sock(struct sock *sk)
2244 {
2245 	struct tcp_sock *tp = tcp_sk(sk);
2246 
2247 	trace_tcp_destroy_sock(sk);
2248 
2249 	tcp_clear_xmit_timers(sk);
2250 
2251 	tcp_cleanup_congestion_control(sk);
2252 
2253 	tcp_cleanup_ulp(sk);
2254 
2255 	/* Clean up the write buffer. */
2256 	tcp_write_queue_purge(sk);
2257 
2258 	/* Check if we want to disable active TFO */
2259 	tcp_fastopen_active_disable_ofo_check(sk);
2260 
2261 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2262 	skb_rbtree_purge(&tp->out_of_order_queue);
2263 
2264 #ifdef CONFIG_TCP_MD5SIG
2265 	/* Clean up the MD5 key list, if any */
2266 	if (tp->md5sig_info) {
2267 		tcp_clear_md5_list(sk);
2268 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2269 		tp->md5sig_info = NULL;
2270 	}
2271 #endif
2272 
2273 	/* Clean up a referenced TCP bind bucket. */
2274 	if (inet_csk(sk)->icsk_bind_hash)
2275 		inet_put_port(sk);
2276 
2277 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2278 
2279 	/* If socket is aborted during connect operation */
2280 	tcp_free_fastopen_req(tp);
2281 	tcp_fastopen_destroy_cipher(sk);
2282 	tcp_saved_syn_free(tp);
2283 
2284 	sk_sockets_allocated_dec(sk);
2285 }
2286 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2287 
2288 #ifdef CONFIG_PROC_FS
2289 /* Proc filesystem TCP sock list dumping. */
2290 
2291 static unsigned short seq_file_family(const struct seq_file *seq);
2292 
2293 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2294 {
2295 	unsigned short family = seq_file_family(seq);
2296 
2297 	/* AF_UNSPEC is used as a match-all */
2298 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2299 		net_eq(sock_net(sk), seq_file_net(seq)));
2300 }
2301 
2302 /* Find a non-empty bucket (starting from st->bucket)
2303  * and return the first sk from it.
2304  */
2305 static void *listening_get_first(struct seq_file *seq)
2306 {
2307 	struct tcp_iter_state *st = seq->private;
2308 
2309 	st->offset = 0;
2310 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2311 		struct inet_listen_hashbucket *ilb2;
2312 		struct hlist_nulls_node *node;
2313 		struct sock *sk;
2314 
2315 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2316 		if (hlist_nulls_empty(&ilb2->nulls_head))
2317 			continue;
2318 
2319 		spin_lock(&ilb2->lock);
2320 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2321 			if (seq_sk_match(seq, sk))
2322 				return sk;
2323 		}
2324 		spin_unlock(&ilb2->lock);
2325 	}
2326 
2327 	return NULL;
2328 }
2329 
2330 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2331  * If "cur" is the last one in st->bucket,
2332  * call listening_get_first() to return the first sk of the next
2333  * non-empty bucket.
2334  */
2335 static void *listening_get_next(struct seq_file *seq, void *cur)
2336 {
2337 	struct tcp_iter_state *st = seq->private;
2338 	struct inet_listen_hashbucket *ilb2;
2339 	struct hlist_nulls_node *node;
2340 	struct sock *sk = cur;
2341 
2342 	++st->num;
2343 	++st->offset;
2344 
2345 	sk = sk_nulls_next(sk);
2346 	sk_nulls_for_each_from(sk, node) {
2347 		if (seq_sk_match(seq, sk))
2348 			return sk;
2349 	}
2350 
2351 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2352 	spin_unlock(&ilb2->lock);
2353 	++st->bucket;
2354 	return listening_get_first(seq);
2355 }
2356 
2357 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2358 {
2359 	struct tcp_iter_state *st = seq->private;
2360 	void *rc;
2361 
2362 	st->bucket = 0;
2363 	st->offset = 0;
2364 	rc = listening_get_first(seq);
2365 
2366 	while (rc && *pos) {
2367 		rc = listening_get_next(seq, rc);
2368 		--*pos;
2369 	}
2370 	return rc;
2371 }
2372 
2373 static inline bool empty_bucket(const struct tcp_iter_state *st)
2374 {
2375 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2376 }
2377 
2378 /*
2379  * Get first established socket starting from bucket given in st->bucket.
2380  * If st->bucket is zero, the very first socket in the hash is returned.
2381  */
2382 static void *established_get_first(struct seq_file *seq)
2383 {
2384 	struct tcp_iter_state *st = seq->private;
2385 
2386 	st->offset = 0;
2387 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2388 		struct sock *sk;
2389 		struct hlist_nulls_node *node;
2390 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2391 
2392 		/* Lockless fast path for the common case of empty buckets */
2393 		if (empty_bucket(st))
2394 			continue;
2395 
2396 		spin_lock_bh(lock);
2397 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2398 			if (seq_sk_match(seq, sk))
2399 				return sk;
2400 		}
2401 		spin_unlock_bh(lock);
2402 	}
2403 
2404 	return NULL;
2405 }
2406 
2407 static void *established_get_next(struct seq_file *seq, void *cur)
2408 {
2409 	struct sock *sk = cur;
2410 	struct hlist_nulls_node *node;
2411 	struct tcp_iter_state *st = seq->private;
2412 
2413 	++st->num;
2414 	++st->offset;
2415 
2416 	sk = sk_nulls_next(sk);
2417 
2418 	sk_nulls_for_each_from(sk, node) {
2419 		if (seq_sk_match(seq, sk))
2420 			return sk;
2421 	}
2422 
2423 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2424 	++st->bucket;
2425 	return established_get_first(seq);
2426 }
2427 
2428 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2429 {
2430 	struct tcp_iter_state *st = seq->private;
2431 	void *rc;
2432 
2433 	st->bucket = 0;
2434 	rc = established_get_first(seq);
2435 
2436 	while (rc && pos) {
2437 		rc = established_get_next(seq, rc);
2438 		--pos;
2439 	}
2440 	return rc;
2441 }
2442 
2443 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2444 {
2445 	void *rc;
2446 	struct tcp_iter_state *st = seq->private;
2447 
2448 	st->state = TCP_SEQ_STATE_LISTENING;
2449 	rc	  = listening_get_idx(seq, &pos);
2450 
2451 	if (!rc) {
2452 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2453 		rc	  = established_get_idx(seq, pos);
2454 	}
2455 
2456 	return rc;
2457 }
2458 
2459 static void *tcp_seek_last_pos(struct seq_file *seq)
2460 {
2461 	struct tcp_iter_state *st = seq->private;
2462 	int bucket = st->bucket;
2463 	int offset = st->offset;
2464 	int orig_num = st->num;
2465 	void *rc = NULL;
2466 
2467 	switch (st->state) {
2468 	case TCP_SEQ_STATE_LISTENING:
2469 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2470 			break;
2471 		st->state = TCP_SEQ_STATE_LISTENING;
2472 		rc = listening_get_first(seq);
2473 		while (offset-- && rc && bucket == st->bucket)
2474 			rc = listening_get_next(seq, rc);
2475 		if (rc)
2476 			break;
2477 		st->bucket = 0;
2478 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2479 		fallthrough;
2480 	case TCP_SEQ_STATE_ESTABLISHED:
2481 		if (st->bucket > tcp_hashinfo.ehash_mask)
2482 			break;
2483 		rc = established_get_first(seq);
2484 		while (offset-- && rc && bucket == st->bucket)
2485 			rc = established_get_next(seq, rc);
2486 	}
2487 
2488 	st->num = orig_num;
2489 
2490 	return rc;
2491 }
2492 
2493 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2494 {
2495 	struct tcp_iter_state *st = seq->private;
2496 	void *rc;
2497 
2498 	if (*pos && *pos == st->last_pos) {
2499 		rc = tcp_seek_last_pos(seq);
2500 		if (rc)
2501 			goto out;
2502 	}
2503 
2504 	st->state = TCP_SEQ_STATE_LISTENING;
2505 	st->num = 0;
2506 	st->bucket = 0;
2507 	st->offset = 0;
2508 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2509 
2510 out:
2511 	st->last_pos = *pos;
2512 	return rc;
2513 }
2514 EXPORT_SYMBOL(tcp_seq_start);
2515 
2516 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2517 {
2518 	struct tcp_iter_state *st = seq->private;
2519 	void *rc = NULL;
2520 
2521 	if (v == SEQ_START_TOKEN) {
2522 		rc = tcp_get_idx(seq, 0);
2523 		goto out;
2524 	}
2525 
2526 	switch (st->state) {
2527 	case TCP_SEQ_STATE_LISTENING:
2528 		rc = listening_get_next(seq, v);
2529 		if (!rc) {
2530 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2531 			st->bucket = 0;
2532 			st->offset = 0;
2533 			rc	  = established_get_first(seq);
2534 		}
2535 		break;
2536 	case TCP_SEQ_STATE_ESTABLISHED:
2537 		rc = established_get_next(seq, v);
2538 		break;
2539 	}
2540 out:
2541 	++*pos;
2542 	st->last_pos = *pos;
2543 	return rc;
2544 }
2545 EXPORT_SYMBOL(tcp_seq_next);
2546 
2547 void tcp_seq_stop(struct seq_file *seq, void *v)
2548 {
2549 	struct tcp_iter_state *st = seq->private;
2550 
2551 	switch (st->state) {
2552 	case TCP_SEQ_STATE_LISTENING:
2553 		if (v != SEQ_START_TOKEN)
2554 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2555 		break;
2556 	case TCP_SEQ_STATE_ESTABLISHED:
2557 		if (v)
2558 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2559 		break;
2560 	}
2561 }
2562 EXPORT_SYMBOL(tcp_seq_stop);
2563 
2564 static void get_openreq4(const struct request_sock *req,
2565 			 struct seq_file *f, int i)
2566 {
2567 	const struct inet_request_sock *ireq = inet_rsk(req);
2568 	long delta = req->rsk_timer.expires - jiffies;
2569 
2570 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2571 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2572 		i,
2573 		ireq->ir_loc_addr,
2574 		ireq->ir_num,
2575 		ireq->ir_rmt_addr,
2576 		ntohs(ireq->ir_rmt_port),
2577 		TCP_SYN_RECV,
2578 		0, 0, /* could print option size, but that is af dependent. */
2579 		1,    /* timers active (only the expire timer) */
2580 		jiffies_delta_to_clock_t(delta),
2581 		req->num_timeout,
2582 		from_kuid_munged(seq_user_ns(f),
2583 				 sock_i_uid(req->rsk_listener)),
2584 		0,  /* non standard timer */
2585 		0, /* open_requests have no inode */
2586 		0,
2587 		req);
2588 }
2589 
2590 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2591 {
2592 	int timer_active;
2593 	unsigned long timer_expires;
2594 	const struct tcp_sock *tp = tcp_sk(sk);
2595 	const struct inet_connection_sock *icsk = inet_csk(sk);
2596 	const struct inet_sock *inet = inet_sk(sk);
2597 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2598 	__be32 dest = inet->inet_daddr;
2599 	__be32 src = inet->inet_rcv_saddr;
2600 	__u16 destp = ntohs(inet->inet_dport);
2601 	__u16 srcp = ntohs(inet->inet_sport);
2602 	int rx_queue;
2603 	int state;
2604 
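	/* timer_active codes reported in /proc/net/tcp (per the branches
	 * below): 1 = retransmit/loss-probe timer, 2 = sk_timer (typically
	 * keepalive), 4 = zero window probe timer, 0 = no timer pending.
	 */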
2605 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2606 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2607 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2608 		timer_active	= 1;
2609 		timer_expires	= icsk->icsk_timeout;
2610 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2611 		timer_active	= 4;
2612 		timer_expires	= icsk->icsk_timeout;
2613 	} else if (timer_pending(&sk->sk_timer)) {
2614 		timer_active	= 2;
2615 		timer_expires	= sk->sk_timer.expires;
2616 	} else {
2617 		timer_active	= 0;
2618 		timer_expires = jiffies;
2619 	}
2620 
2621 	state = inet_sk_state_load(sk);
2622 	if (state == TCP_LISTEN)
2623 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2624 	else
2625 		/* Because we don't lock the socket,
2626 		 * we might find a transient negative value.
2627 		 */
2628 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2629 				      READ_ONCE(tp->copied_seq), 0);
2630 
2631 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2632 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2633 		i, src, srcp, dest, destp, state,
2634 		READ_ONCE(tp->write_seq) - tp->snd_una,
2635 		rx_queue,
2636 		timer_active,
2637 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2638 		icsk->icsk_retransmits,
2639 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2640 		icsk->icsk_probes_out,
2641 		sock_i_ino(sk),
2642 		refcount_read(&sk->sk_refcnt), sk,
2643 		jiffies_to_clock_t(icsk->icsk_rto),
2644 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2645 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2646 		tcp_snd_cwnd(tp),
2647 		state == TCP_LISTEN ?
2648 		    fastopenq->max_qlen :
2649 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2650 }
2651 
2652 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2653 			       struct seq_file *f, int i)
2654 {
2655 	long delta = tw->tw_timer.expires - jiffies;
2656 	__be32 dest, src;
2657 	__u16 destp, srcp;
2658 
2659 	dest  = tw->tw_daddr;
2660 	src   = tw->tw_rcv_saddr;
2661 	destp = ntohs(tw->tw_dport);
2662 	srcp  = ntohs(tw->tw_sport);
2663 
2664 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2665 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2666 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2667 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2668 		refcount_read(&tw->tw_refcnt), tw);
2669 }
2670 
2671 #define TMPSZ 150
2672 
2673 static int tcp4_seq_show(struct seq_file *seq, void *v)
2674 {
2675 	struct tcp_iter_state *st;
2676 	struct sock *sk = v;
2677 
2678 	seq_setwidth(seq, TMPSZ - 1);
2679 	if (v == SEQ_START_TOKEN) {
2680 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2681 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2682 			   "inode");
2683 		goto out;
2684 	}
2685 	st = seq->private;
2686 
2687 	if (sk->sk_state == TCP_TIME_WAIT)
2688 		get_timewait4_sock(v, seq, st->num);
2689 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2690 		get_openreq4(v, seq, st->num);
2691 	else
2692 		get_tcp4_sock(v, seq, st->num);
2693 out:
2694 	seq_pad(seq, '\n');
2695 	return 0;
2696 }
2697 
2698 #ifdef CONFIG_BPF_SYSCALL
2699 struct bpf_tcp_iter_state {
2700 	struct tcp_iter_state state;
2701 	unsigned int cur_sk;
2702 	unsigned int end_sk;
2703 	unsigned int max_sk;
2704 	struct sock **batch;
2705 	bool st_bucket_done;
2706 };
2707 
2708 struct bpf_iter__tcp {
2709 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2710 	__bpf_md_ptr(struct sock_common *, sk_common);
2711 	uid_t uid __aligned(8);
2712 };
2713 
2714 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2715 			     struct sock_common *sk_common, uid_t uid)
2716 {
2717 	struct bpf_iter__tcp ctx;
2718 
2719 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2720 	ctx.meta = meta;
2721 	ctx.sk_common = sk_common;
2722 	ctx.uid = uid;
2723 	return bpf_iter_run_prog(prog, &ctx);
2724 }
2725 
2726 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2727 {
2728 	while (iter->cur_sk < iter->end_sk)
2729 		sock_put(iter->batch[iter->cur_sk++]);
2730 }
2731 
2732 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2733 				      unsigned int new_batch_sz)
2734 {
2735 	struct sock **new_batch;
2736 
2737 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2738 			     GFP_USER | __GFP_NOWARN);
2739 	if (!new_batch)
2740 		return -ENOMEM;
2741 
2742 	bpf_iter_tcp_put_batch(iter);
2743 	kvfree(iter->batch);
2744 	iter->batch = new_batch;
2745 	iter->max_sk = new_batch_sz;
2746 
2747 	return 0;
2748 }
2749 
2750 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2751 						 struct sock *start_sk)
2752 {
2753 	struct bpf_tcp_iter_state *iter = seq->private;
2754 	struct tcp_iter_state *st = &iter->state;
2755 	struct hlist_nulls_node *node;
2756 	unsigned int expected = 1;
2757 	struct sock *sk;
2758 
2759 	sock_hold(start_sk);
2760 	iter->batch[iter->end_sk++] = start_sk;
2761 
2762 	sk = sk_nulls_next(start_sk);
2763 	sk_nulls_for_each_from(sk, node) {
2764 		if (seq_sk_match(seq, sk)) {
2765 			if (iter->end_sk < iter->max_sk) {
2766 				sock_hold(sk);
2767 				iter->batch[iter->end_sk++] = sk;
2768 			}
2769 			expected++;
2770 		}
2771 	}
2772 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2773 
2774 	return expected;
2775 }
2776 
2777 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2778 						   struct sock *start_sk)
2779 {
2780 	struct bpf_tcp_iter_state *iter = seq->private;
2781 	struct tcp_iter_state *st = &iter->state;
2782 	struct hlist_nulls_node *node;
2783 	unsigned int expected = 1;
2784 	struct sock *sk;
2785 
2786 	sock_hold(start_sk);
2787 	iter->batch[iter->end_sk++] = start_sk;
2788 
2789 	sk = sk_nulls_next(start_sk);
2790 	sk_nulls_for_each_from(sk, node) {
2791 		if (seq_sk_match(seq, sk)) {
2792 			if (iter->end_sk < iter->max_sk) {
2793 				sock_hold(sk);
2794 				iter->batch[iter->end_sk++] = sk;
2795 			}
2796 			expected++;
2797 		}
2798 	}
2799 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2800 
2801 	return expected;
2802 }
2803 
2804 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2805 {
2806 	struct bpf_tcp_iter_state *iter = seq->private;
2807 	struct tcp_iter_state *st = &iter->state;
2808 	unsigned int expected;
2809 	bool resized = false;
2810 	struct sock *sk;
2811 
2812 	/* The st->bucket is done.  Directly advance to the next
2813 	 * bucket instead of having tcp_seek_last_pos() skip sockets
2814 	 * one by one in the current bucket only to find out
2815 	 * it has to advance to the next bucket.
2816 	 */
2817 	if (iter->st_bucket_done) {
2818 		st->offset = 0;
2819 		st->bucket++;
2820 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2821 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2822 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2823 			st->bucket = 0;
2824 		}
2825 	}
2826 
2827 again:
2828 	/* Get a new batch */
2829 	iter->cur_sk = 0;
2830 	iter->end_sk = 0;
2831 	iter->st_bucket_done = false;
2832 
2833 	sk = tcp_seek_last_pos(seq);
2834 	if (!sk)
2835 		return NULL; /* Done */
2836 
2837 	if (st->state == TCP_SEQ_STATE_LISTENING)
2838 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2839 	else
2840 		expected = bpf_iter_tcp_established_batch(seq, sk);
2841 
2842 	if (iter->end_sk == expected) {
2843 		iter->st_bucket_done = true;
2844 		return sk;
2845 	}
2846 
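	/* The bucket did not fit into the batch: grow it to 1.5x the number
	 * of sockets seen and retry the bucket once.
	 */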
2847 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2848 		resized = true;
2849 		goto again;
2850 	}
2851 
2852 	return sk;
2853 }
2854 
2855 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2856 {
2857 	/* bpf iter does not support lseek, so it always
2858 	 * continues from where it was stop()-ped.
2859 	 */
2860 	if (*pos)
2861 		return bpf_iter_tcp_batch(seq);
2862 
2863 	return SEQ_START_TOKEN;
2864 }
2865 
2866 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2867 {
2868 	struct bpf_tcp_iter_state *iter = seq->private;
2869 	struct tcp_iter_state *st = &iter->state;
2870 	struct sock *sk;
2871 
2872 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
2873 	 * been through seq_show(), so advance to the next sk in
2874 	 * the batch.
2875 	 */
2876 	if (iter->cur_sk < iter->end_sk) {
2877 		/* Keep st->num consistent in tcp_iter_state.
2878 		 * bpf_iter_tcp does not use st->num.
2879 		 * meta.seq_num is used instead.
2880 		 */
2881 		st->num++;
2882 		/* Move st->offset to the next sk in the bucket such that
2883 		 * the future start() will resume at st->offset in
2884 		 * st->bucket.  See tcp_seek_last_pos().
2885 		 */
2886 		st->offset++;
2887 		sock_put(iter->batch[iter->cur_sk++]);
2888 	}
2889 
2890 	if (iter->cur_sk < iter->end_sk)
2891 		sk = iter->batch[iter->cur_sk];
2892 	else
2893 		sk = bpf_iter_tcp_batch(seq);
2894 
2895 	++*pos;
2896 	/* Keep st->last_pos consistent in tcp_iter_state.
2897 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2898 	 */
2899 	st->last_pos = *pos;
2900 	return sk;
2901 }
2902 
2903 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2904 {
2905 	struct bpf_iter_meta meta;
2906 	struct bpf_prog *prog;
2907 	struct sock *sk = v;
2908 	bool slow;
2909 	uid_t uid;
2910 	int ret;
2911 
2912 	if (v == SEQ_START_TOKEN)
2913 		return 0;
2914 
2915 	if (sk_fullsock(sk))
2916 		slow = lock_sock_fast(sk);
2917 
2918 	if (unlikely(sk_unhashed(sk))) {
2919 		ret = SEQ_SKIP;
2920 		goto unlock;
2921 	}
2922 
2923 	if (sk->sk_state == TCP_TIME_WAIT) {
2924 		uid = 0;
2925 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2926 		const struct request_sock *req = v;
2927 
2928 		uid = from_kuid_munged(seq_user_ns(seq),
2929 				       sock_i_uid(req->rsk_listener));
2930 	} else {
2931 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2932 	}
2933 
2934 	meta.seq = seq;
2935 	prog = bpf_iter_get_info(&meta, false);
2936 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2937 
2938 unlock:
2939 	if (sk_fullsock(sk))
2940 		unlock_sock_fast(sk, slow);
2941 	return ret;
2942 
2943 }
2944 
2945 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2946 {
2947 	struct bpf_tcp_iter_state *iter = seq->private;
2948 	struct bpf_iter_meta meta;
2949 	struct bpf_prog *prog;
2950 
2951 	if (!v) {
2952 		meta.seq = seq;
2953 		prog = bpf_iter_get_info(&meta, true);
2954 		if (prog)
2955 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2956 	}
2957 
2958 	if (iter->cur_sk < iter->end_sk) {
2959 		bpf_iter_tcp_put_batch(iter);
2960 		iter->st_bucket_done = false;
2961 	}
2962 }
2963 
2964 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2965 	.show		= bpf_iter_tcp_seq_show,
2966 	.start		= bpf_iter_tcp_seq_start,
2967 	.next		= bpf_iter_tcp_seq_next,
2968 	.stop		= bpf_iter_tcp_seq_stop,
2969 };
2970 #endif
2971 static unsigned short seq_file_family(const struct seq_file *seq)
2972 {
2973 	const struct tcp_seq_afinfo *afinfo;
2974 
2975 #ifdef CONFIG_BPF_SYSCALL
2976 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2977 	if (seq->op == &bpf_iter_tcp_seq_ops)
2978 		return AF_UNSPEC;
2979 #endif
2980 
2981 	/* Iterated from proc fs */
2982 	afinfo = pde_data(file_inode(seq->file));
2983 	return afinfo->family;
2984 }
2985 
2986 static const struct seq_operations tcp4_seq_ops = {
2987 	.show		= tcp4_seq_show,
2988 	.start		= tcp_seq_start,
2989 	.next		= tcp_seq_next,
2990 	.stop		= tcp_seq_stop,
2991 };
2992 
2993 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2994 	.family		= AF_INET,
2995 };
2996 
2997 static int __net_init tcp4_proc_init_net(struct net *net)
2998 {
2999 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3000 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3001 		return -ENOMEM;
3002 	return 0;
3003 }
3004 
3005 static void __net_exit tcp4_proc_exit_net(struct net *net)
3006 {
3007 	remove_proc_entry("tcp", net->proc_net);
3008 }
3009 
3010 static struct pernet_operations tcp4_net_ops = {
3011 	.init = tcp4_proc_init_net,
3012 	.exit = tcp4_proc_exit_net,
3013 };
3014 
3015 int __init tcp4_proc_init(void)
3016 {
3017 	return register_pernet_subsys(&tcp4_net_ops);
3018 }
3019 
3020 void tcp4_proc_exit(void)
3021 {
3022 	unregister_pernet_subsys(&tcp4_net_ops);
3023 }
3024 #endif /* CONFIG_PROC_FS */
3025 
3026 /* @wake is one when sk_stream_write_space() calls us.
3027  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3028  * This mimics the strategy used in sock_def_write_space().
3029  */
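/* Illustrative: with tcp_notsent_lowat = 131072 and wake == 1, EPOLLOUT is
 * signalled only while write_seq - snd_nxt < 65536, since the test below is
 * (notsent_bytes << wake) < tcp_notsent_lowat(tp).
 */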
3030 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3031 {
3032 	const struct tcp_sock *tp = tcp_sk(sk);
3033 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3034 			    READ_ONCE(tp->snd_nxt);
3035 
3036 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3037 }
3038 EXPORT_SYMBOL(tcp_stream_memory_free);
3039 
3040 struct proto tcp_prot = {
3041 	.name			= "TCP",
3042 	.owner			= THIS_MODULE,
3043 	.close			= tcp_close,
3044 	.pre_connect		= tcp_v4_pre_connect,
3045 	.connect		= tcp_v4_connect,
3046 	.disconnect		= tcp_disconnect,
3047 	.accept			= inet_csk_accept,
3048 	.ioctl			= tcp_ioctl,
3049 	.init			= tcp_v4_init_sock,
3050 	.destroy		= tcp_v4_destroy_sock,
3051 	.shutdown		= tcp_shutdown,
3052 	.setsockopt		= tcp_setsockopt,
3053 	.getsockopt		= tcp_getsockopt,
3054 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3055 	.keepalive		= tcp_set_keepalive,
3056 	.recvmsg		= tcp_recvmsg,
3057 	.sendmsg		= tcp_sendmsg,
3058 	.sendpage		= tcp_sendpage,
3059 	.backlog_rcv		= tcp_v4_do_rcv,
3060 	.release_cb		= tcp_release_cb,
3061 	.hash			= inet_hash,
3062 	.unhash			= inet_unhash,
3063 	.get_port		= inet_csk_get_port,
3064 	.put_port		= inet_put_port,
3065 #ifdef CONFIG_BPF_SYSCALL
3066 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3067 #endif
3068 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3069 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3070 	.stream_memory_free	= tcp_stream_memory_free,
3071 	.sockets_allocated	= &tcp_sockets_allocated,
3072 	.orphan_count		= &tcp_orphan_count,
3073 
3074 	.memory_allocated	= &tcp_memory_allocated,
3075 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3076 
3077 	.memory_pressure	= &tcp_memory_pressure,
3078 	.sysctl_mem		= sysctl_tcp_mem,
3079 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3080 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3081 	.max_header		= MAX_TCP_HEADER,
3082 	.obj_size		= sizeof(struct tcp_sock),
3083 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3084 	.twsk_prot		= &tcp_timewait_sock_ops,
3085 	.rsk_prot		= &tcp_request_sock_ops,
3086 	.h.hashinfo		= &tcp_hashinfo,
3087 	.no_autobind		= true,
3088 	.diag_destroy		= tcp_abort,
3089 };
3090 EXPORT_SYMBOL(tcp_prot);
3091 
3092 static void __net_exit tcp_sk_exit(struct net *net)
3093 {
3094 	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3095 
3096 	if (net->ipv4.tcp_congestion_control)
3097 		bpf_module_put(net->ipv4.tcp_congestion_control,
3098 			       net->ipv4.tcp_congestion_control->owner);
3099 	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3100 		kfree(tcp_death_row);
3101 }
3102 
3103 static int __net_init tcp_sk_init(struct net *net)
3104 {
3105 	int cnt;
3106 
3107 	net->ipv4.sysctl_tcp_ecn = 2;
3108 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3109 
3110 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3111 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3112 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3113 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3114 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3115 
3116 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3117 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3118 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3119 
3120 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3121 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3122 	net->ipv4.sysctl_tcp_syncookies = 1;
3123 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3124 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3125 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3126 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3127 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3128 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3129 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3130 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3131 
3132 	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3133 	if (!net->ipv4.tcp_death_row)
3134 		return -ENOMEM;
3135 	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3136 	cnt = tcp_hashinfo.ehash_mask + 1;
3137 	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3138 	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3139 
3140 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3141 	net->ipv4.sysctl_tcp_sack = 1;
3142 	net->ipv4.sysctl_tcp_window_scaling = 1;
3143 	net->ipv4.sysctl_tcp_timestamps = 1;
3144 	net->ipv4.sysctl_tcp_early_retrans = 3;
3145 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3146 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3147 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3148 	net->ipv4.sysctl_tcp_max_reordering = 300;
3149 	net->ipv4.sysctl_tcp_dsack = 1;
3150 	net->ipv4.sysctl_tcp_app_win = 31;
3151 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3152 	net->ipv4.sysctl_tcp_frto = 2;
3153 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3154 	/* This limits the percentage of the congestion window which we
3155 	 * will allow a single TSO frame to consume.  Building TSO frames
3156 	 * which are too large can cause TCP streams to be bursty.
3157 	 */
3158 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3159 	/* Default TSQ limit of 16 TSO segments */
3160 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
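	/* i.e. 16 * 64 KiB = 1048576 bytes (1 MiB). */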
3161 
3162 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3163 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3164 
3165 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3166 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3167 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3168 	net->ipv4.sysctl_tcp_autocorking = 1;
3169 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3170 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3171 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3172 	if (net != &init_net) {
3173 		memcpy(net->ipv4.sysctl_tcp_rmem,
3174 		       init_net.ipv4.sysctl_tcp_rmem,
3175 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3176 		memcpy(net->ipv4.sysctl_tcp_wmem,
3177 		       init_net.ipv4.sysctl_tcp_wmem,
3178 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3179 	}
3180 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3181 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3182 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3183 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3184 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3185 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3186 
3187 	/* Reno is always built in */
3188 	if (!net_eq(net, &init_net) &&
3189 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3190 			       init_net.ipv4.tcp_congestion_control->owner))
3191 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3192 	else
3193 		net->ipv4.tcp_congestion_control = &tcp_reno;
3194 
3195 	return 0;
3196 }
3197 
3198 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3199 {
3200 	struct net *net;
3201 
3202 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3203 
3204 	list_for_each_entry(net, net_exit_list, exit_list)
3205 		tcp_fastopen_ctx_destroy(net);
3206 }
3207 
3208 static struct pernet_operations __net_initdata tcp_sk_ops = {
3209        .init	   = tcp_sk_init,
3210        .exit	   = tcp_sk_exit,
3211        .exit_batch = tcp_sk_exit_batch,
3212 };
3213 
3214 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3215 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3216 		     struct sock_common *sk_common, uid_t uid)
3217 
3218 #define INIT_BATCH_SZ 16
3219 
3220 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3221 {
3222 	struct bpf_tcp_iter_state *iter = priv_data;
3223 	int err;
3224 
3225 	err = bpf_iter_init_seq_net(priv_data, aux);
3226 	if (err)
3227 		return err;
3228 
3229 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3230 	if (err) {
3231 		bpf_iter_fini_seq_net(priv_data);
3232 		return err;
3233 	}
3234 
3235 	return 0;
3236 }
3237 
3238 static void bpf_iter_fini_tcp(void *priv_data)
3239 {
3240 	struct bpf_tcp_iter_state *iter = priv_data;
3241 
3242 	bpf_iter_fini_seq_net(priv_data);
3243 	kvfree(iter->batch);
3244 }
3245 
3246 static const struct bpf_iter_seq_info tcp_seq_info = {
3247 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3248 	.init_seq_private	= bpf_iter_init_tcp,
3249 	.fini_seq_private	= bpf_iter_fini_tcp,
3250 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3251 };
3252 
3253 static const struct bpf_func_proto *
3254 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3255 			    const struct bpf_prog *prog)
3256 {
3257 	switch (func_id) {
3258 	case BPF_FUNC_setsockopt:
3259 		return &bpf_sk_setsockopt_proto;
3260 	case BPF_FUNC_getsockopt:
3261 		return &bpf_sk_getsockopt_proto;
3262 	default:
3263 		return NULL;
3264 	}
3265 }
3266 
3267 static struct bpf_iter_reg tcp_reg_info = {
3268 	.target			= "tcp",
3269 	.ctx_arg_info_size	= 1,
3270 	.ctx_arg_info		= {
3271 		{ offsetof(struct bpf_iter__tcp, sk_common),
3272 		  PTR_TO_BTF_ID_OR_NULL },
3273 	},
3274 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3275 	.seq_info		= &tcp_seq_info,
3276 };
3277 
3278 static void __init bpf_iter_register(void)
3279 {
3280 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3281 	if (bpf_iter_reg_target(&tcp_reg_info))
3282 		pr_warn("Warning: could not register bpf iterator tcp\n");
3283 }
3284 
3285 #endif
3286 
3287 void __init tcp_v4_init(void)
3288 {
3289 	int cpu, res;
3290 
3291 	for_each_possible_cpu(cpu) {
3292 		struct sock *sk;
3293 
3294 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3295 					   IPPROTO_TCP, &init_net);
3296 		if (res)
3297 			panic("Failed to create the TCP control socket.\n");
3298 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3299 
3300 		/* Please enforce IP_DF and IPID==0 for RST and
3301 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3302 		 */
3303 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3304 
3305 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3306 	}
3307 	if (register_pernet_subsys(&tcp_sk_ops))
3308 		panic("Failed to create the TCP control socket.\n");
3309 
3310 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3311 	bpf_iter_register();
3312 #endif
3313 }
3314