xref: /linux/net/ipv4/tcp_ipv4.c (revision d819524d3144f4703f45f473fdc85ad7579ae94c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95 
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 	return secure_tcp_seq(ip_hdr(skb)->daddr,
99 			      ip_hdr(skb)->saddr,
100 			      tcp_hdr(skb)->dest,
101 			      tcp_hdr(skb)->source);
102 }
103 
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 	struct tcp_sock *tp = tcp_sk(sk);
115 
116 	if (reuse == 2) {
117 		/* Still does not detect *everything* that goes through
118 		 * lo, since we require a loopback src or dst address
119 		 * or direct binding to 'lo' interface.
120 		 */
121 		bool loopback = false;
122 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 			loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 		if (tw->tw_family == AF_INET6) {
126 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 				loopback = true;
131 		} else
132 #endif
133 		{
134 			if (ipv4_is_loopback(tw->tw_daddr) ||
135 			    ipv4_is_loopback(tw->tw_rcv_saddr))
136 				loopback = true;
137 		}
138 		if (!loopback)
139 			reuse = 0;
140 	}
141 
142 	/* With PAWS, it is safe from the viewpoint
143 	   of data integrity. Even without PAWS it is safe provided sequence
144 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145 
146 	   Actually, the idea is close to VJ's one, only timestamp cache is
147 	   held not per host, but per port pair and TW bucket is used as state
148 	   holder.
149 
150 	   If TW bucket has been already destroyed we fall back to VJ's scheme
151 	   and use initial timestamp retrieved from peer table.
152 	 */
153 	if (tcptw->tw_ts_recent_stamp &&
154 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
155 					    tcptw->tw_ts_recent_stamp)))) {
156 		/* In case of repair and re-using TIME-WAIT sockets we still
157 		 * want to be sure that it is safe as above but honor the
158 		 * sequence numbers and time stamps set as part of the repair
159 		 * process.
160 		 *
161 		 * Without this check re-using a TIME-WAIT socket with TCP
162 		 * repair would accumulate a -1 on the repair assigned
163 		 * sequence number. The first time it is reused the sequence
164 		 * is -1, the second time -2, etc. This fixes that issue
165 		 * without appearing to create any others.
166 		 */
167 		if (likely(!tp->repair)) {
168 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169 
170 			if (!seq)
171 				seq = 1;
172 			WRITE_ONCE(tp->write_seq, seq);
173 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
174 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 		}
176 		sock_hold(sktw);
177 		return 1;
178 	}
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183 
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 			      int addr_len)
186 {
187 	/* This check is replicated from tcp_v4_connect() and intended to
188 	 * prevent BPF program called below from accessing bytes that are out
189 	 * of the bound specified by user in addr_len.
190 	 */
191 	if (addr_len < sizeof(struct sockaddr_in))
192 		return -EINVAL;
193 
194 	sock_owned_by_me(sk);
195 
196 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198 
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 	struct inet_sock *inet = inet_sk(sk);
204 	struct tcp_sock *tp = tcp_sk(sk);
205 	__be16 orig_sport, orig_dport;
206 	__be32 daddr, nexthop;
207 	struct flowi4 *fl4;
208 	struct rtable *rt;
209 	int err;
210 	struct ip_options_rcu *inet_opt;
211 	struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212 
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	if (usin->sin_family != AF_INET)
217 		return -EAFNOSUPPORT;
218 
219 	nexthop = daddr = usin->sin_addr.s_addr;
220 	inet_opt = rcu_dereference_protected(inet->inet_opt,
221 					     lockdep_sock_is_held(sk));
222 	if (inet_opt && inet_opt->opt.srr) {
223 		if (!daddr)
224 			return -EINVAL;
225 		nexthop = inet_opt->opt.faddr;
226 	}
227 
228 	orig_sport = inet->inet_sport;
229 	orig_dport = usin->sin_port;
230 	fl4 = &inet->cork.fl.u.ip4;
231 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233 			      orig_dport, sk);
234 	if (IS_ERR(rt)) {
235 		err = PTR_ERR(rt);
236 		if (err == -ENETUNREACH)
237 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 		return err;
239 	}
240 
241 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 		ip_rt_put(rt);
243 		return -ENETUNREACH;
244 	}
245 
246 	if (!inet_opt || !inet_opt->opt.srr)
247 		daddr = fl4->daddr;
248 
249 	if (!inet->inet_saddr)
250 		inet->inet_saddr = fl4->saddr;
251 	sk_rcv_saddr_set(sk, inet->inet_saddr);
252 
253 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 		/* Reset inherited state */
255 		tp->rx_opt.ts_recent	   = 0;
256 		tp->rx_opt.ts_recent_stamp = 0;
257 		if (likely(!tp->repair))
258 			WRITE_ONCE(tp->write_seq, 0);
259 	}
260 
261 	inet->inet_dport = usin->sin_port;
262 	sk_daddr_set(sk, daddr);
263 
264 	inet_csk(sk)->icsk_ext_hdr_len = 0;
265 	if (inet_opt)
266 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267 
268 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269 
270 	/* Socket identity is still unknown (sport may be zero).
271 	 * However we set state to SYN-SENT and not releasing socket
272 	 * lock select source port, enter ourselves into the hash tables and
273 	 * complete initialization after this.
274 	 */
275 	tcp_set_state(sk, TCP_SYN_SENT);
276 	err = inet_hash_connect(tcp_death_row, sk);
277 	if (err)
278 		goto failure;
279 
280 	sk_set_txhash(sk);
281 
282 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 			       inet->inet_sport, inet->inet_dport, sk);
284 	if (IS_ERR(rt)) {
285 		err = PTR_ERR(rt);
286 		rt = NULL;
287 		goto failure;
288 	}
289 	/* OK, now commit destination to socket.  */
290 	sk->sk_gso_type = SKB_GSO_TCPV4;
291 	sk_setup_caps(sk, &rt->dst);
292 	rt = NULL;
293 
294 	if (likely(!tp->repair)) {
295 		if (!tp->write_seq)
296 			WRITE_ONCE(tp->write_seq,
297 				   secure_tcp_seq(inet->inet_saddr,
298 						  inet->inet_daddr,
299 						  inet->inet_sport,
300 						  usin->sin_port));
301 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 						 inet->inet_saddr,
303 						 inet->inet_daddr);
304 	}
305 
306 	inet->inet_id = prandom_u32();
307 
308 	if (tcp_fastopen_defer_connect(sk, &err))
309 		return err;
310 	if (err)
311 		goto failure;
312 
313 	err = tcp_connect(sk);
314 
315 	if (err)
316 		goto failure;
317 
318 	return 0;
319 
320 failure:
321 	/*
322 	 * This unhashes the socket and releases the local port,
323 	 * if necessary.
324 	 */
325 	tcp_set_state(sk, TCP_CLOSE);
326 	ip_rt_put(rt);
327 	sk->sk_route_caps = 0;
328 	inet->inet_dport = 0;
329 	return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
332 
333 /*
334  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335  * It can be called through tcp_release_cb() if socket was owned by user
336  * at the time tcp_v4_err() was called to handle ICMP message.
337  */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340 	struct inet_sock *inet = inet_sk(sk);
341 	struct dst_entry *dst;
342 	u32 mtu;
343 
344 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 		return;
346 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 	dst = inet_csk_update_pmtu(sk, mtu);
348 	if (!dst)
349 		return;
350 
351 	/* Something is about to be wrong... Remember soft error
352 	 * for the case, if this connection will not able to recover.
353 	 */
354 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 		sk->sk_err_soft = EMSGSIZE;
356 
357 	mtu = dst_mtu(dst);
358 
359 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 	    ip_sk_accept_pmtu(sk) &&
361 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 		tcp_sync_mss(sk, mtu);
363 
364 		/* Resend the TCP packet because it's
365 		 * clear that the old packet has been
366 		 * dropped. This is the new "fast" path mtu
367 		 * discovery.
368 		 */
369 		tcp_simple_retransmit(sk);
370 	} /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373 
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376 	struct dst_entry *dst = __sk_dst_check(sk, 0);
377 
378 	if (dst)
379 		dst->ops->redirect(dst, sk, skb);
380 }
381 
382 
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386 	struct request_sock *req = inet_reqsk(sk);
387 	struct net *net = sock_net(sk);
388 
389 	/* ICMPs are not backlogged, hence we cannot get
390 	 * an established socket here.
391 	 */
392 	if (seq != tcp_rsk(req)->snt_isn) {
393 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 	} else if (abort) {
395 		/*
396 		 * Still in SYN_RECV, just remove it silently.
397 		 * There is no good way to pass the error to the newly
398 		 * created socket, and POSIX does not want network
399 		 * errors returned from accept().
400 		 */
401 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 		tcp_listendrop(req->rsk_listener);
403 	}
404 	reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407 
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 {
411 	struct inet_connection_sock *icsk = inet_csk(sk);
412 	struct tcp_sock *tp = tcp_sk(sk);
413 	struct sk_buff *skb;
414 	s32 remaining;
415 	u32 delta_us;
416 
417 	if (sock_owned_by_user(sk))
418 		return;
419 
420 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421 	    !icsk->icsk_backoff)
422 		return;
423 
424 	skb = tcp_rtx_queue_head(sk);
425 	if (WARN_ON_ONCE(!skb))
426 		return;
427 
428 	icsk->icsk_backoff--;
429 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431 
432 	tcp_mstamp_refresh(tp);
433 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435 
436 	if (remaining > 0) {
437 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 					  remaining, TCP_RTO_MAX);
439 	} else {
440 		/* RTO revert clocked out retransmission.
441 		 * Will retransmit now.
442 		 */
443 		tcp_retransmit_timer(sk);
444 	}
445 }
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
447 
448 /*
449  * This routine is called by the ICMP module when it gets some
450  * sort of error condition.  If err < 0 then the socket should
451  * be closed and the error returned to the user.  If err > 0
452  * it's just the icmp type << 8 | icmp code.  After adjustment
453  * header points to the first 8 bytes of the tcp header.  We need
454  * to find the appropriate port.
455  *
456  * The locking strategy used here is very "optimistic". When
457  * someone else accesses the socket the ICMP is just dropped
458  * and for some paths there is no check at all.
459  * A more general error queue to queue errors for later handling
460  * is probably better.
461  *
462  */
463 
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 {
466 	const struct iphdr *iph = (const struct iphdr *)skb->data;
467 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 	struct tcp_sock *tp;
469 	struct inet_sock *inet;
470 	const int type = icmp_hdr(skb)->type;
471 	const int code = icmp_hdr(skb)->code;
472 	struct sock *sk;
473 	struct request_sock *fastopen;
474 	u32 seq, snd_una;
475 	int err;
476 	struct net *net = dev_net(skb->dev);
477 
478 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 				       th->dest, iph->saddr, ntohs(th->source),
480 				       inet_iif(skb), 0);
481 	if (!sk) {
482 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 		return -ENOENT;
484 	}
485 	if (sk->sk_state == TCP_TIME_WAIT) {
486 		inet_twsk_put(inet_twsk(sk));
487 		return 0;
488 	}
489 	seq = ntohl(th->seq);
490 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 				     type == ICMP_TIME_EXCEEDED ||
493 				     (type == ICMP_DEST_UNREACH &&
494 				      (code == ICMP_NET_UNREACH ||
495 				       code == ICMP_HOST_UNREACH)));
496 		return 0;
497 	}
498 
499 	bh_lock_sock(sk);
500 	/* If too many ICMPs get dropped on busy
501 	 * servers this needs to be solved differently.
502 	 * We do take care of PMTU discovery (RFC1191) special case :
503 	 * we can receive locally generated ICMP messages while socket is held.
504 	 */
505 	if (sock_owned_by_user(sk)) {
506 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 	}
509 	if (sk->sk_state == TCP_CLOSE)
510 		goto out;
511 
512 	if (static_branch_unlikely(&ip4_min_ttl)) {
513 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
514 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 			goto out;
517 		}
518 	}
519 
520 	tp = tcp_sk(sk);
521 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 	fastopen = rcu_dereference(tp->fastopen_rsk);
523 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 	if (sk->sk_state != TCP_LISTEN &&
525 	    !between(seq, snd_una, tp->snd_nxt)) {
526 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 		goto out;
528 	}
529 
530 	switch (type) {
531 	case ICMP_REDIRECT:
532 		if (!sock_owned_by_user(sk))
533 			do_redirect(skb, sk);
534 		goto out;
535 	case ICMP_SOURCE_QUENCH:
536 		/* Just silently ignore these. */
537 		goto out;
538 	case ICMP_PARAMETERPROB:
539 		err = EPROTO;
540 		break;
541 	case ICMP_DEST_UNREACH:
542 		if (code > NR_ICMP_UNREACH)
543 			goto out;
544 
545 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 			/* We are not interested in TCP_LISTEN and open_requests
547 			 * (SYN-ACKs send out by Linux are always <576bytes so
548 			 * they should go through unfragmented).
549 			 */
550 			if (sk->sk_state == TCP_LISTEN)
551 				goto out;
552 
553 			WRITE_ONCE(tp->mtu_info, info);
554 			if (!sock_owned_by_user(sk)) {
555 				tcp_v4_mtu_reduced(sk);
556 			} else {
557 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 					sock_hold(sk);
559 			}
560 			goto out;
561 		}
562 
563 		err = icmp_err_convert[code].errno;
564 		/* check if this ICMP message allows revert of backoff.
565 		 * (see RFC 6069)
566 		 */
567 		if (!fastopen &&
568 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 			tcp_ld_RTO_revert(sk, seq);
570 		break;
571 	case ICMP_TIME_EXCEEDED:
572 		err = EHOSTUNREACH;
573 		break;
574 	default:
575 		goto out;
576 	}
577 
578 	switch (sk->sk_state) {
579 	case TCP_SYN_SENT:
580 	case TCP_SYN_RECV:
581 		/* Only in fast or simultaneous open. If a fast open socket is
582 		 * already accepted it is treated as a connected one below.
583 		 */
584 		if (fastopen && !fastopen->sk)
585 			break;
586 
587 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588 
589 		if (!sock_owned_by_user(sk)) {
590 			sk->sk_err = err;
591 
592 			sk_error_report(sk);
593 
594 			tcp_done(sk);
595 		} else {
596 			sk->sk_err_soft = err;
597 		}
598 		goto out;
599 	}
600 
601 	/* If we've already connected we will keep trying
602 	 * until we time out, or the user gives up.
603 	 *
604 	 * rfc1122 4.2.3.9 allows to consider as hard errors
605 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
606 	 * but it is obsoleted by pmtu discovery).
607 	 *
608 	 * Note, that in modern internet, where routing is unreliable
609 	 * and in each dark corner broken firewalls sit, sending random
610 	 * errors ordered by their masters even this two messages finally lose
611 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
612 	 *
613 	 * Now we are in compliance with RFCs.
614 	 *							--ANK (980905)
615 	 */
616 
617 	inet = inet_sk(sk);
618 	if (!sock_owned_by_user(sk) && inet->recverr) {
619 		sk->sk_err = err;
620 		sk_error_report(sk);
621 	} else	{ /* Only an error on timeout */
622 		sk->sk_err_soft = err;
623 	}
624 
625 out:
626 	bh_unlock_sock(sk);
627 	sock_put(sk);
628 	return 0;
629 }
630 
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633 	struct tcphdr *th = tcp_hdr(skb);
634 
635 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 	skb->csum_start = skb_transport_header(skb) - skb->head;
637 	skb->csum_offset = offsetof(struct tcphdr, check);
638 }
639 
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 	const struct inet_sock *inet = inet_sk(sk);
644 
645 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648 
649 /*
650  *	This routine will send an RST to the other tcp.
651  *
652  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
653  *		      for reset.
654  *	Answer: if a packet caused RST, it is not for a socket
655  *		existing in our system, if it is matched to a socket,
656  *		it is just duplicate segment or bug in other side's TCP.
657  *		So that we build reply only basing on parameters
658  *		arrived with segment.
659  *	Exception: precedence violation. We do not implement it in any case.
660  */
661 
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
667 
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 	const struct tcphdr *th = tcp_hdr(skb);
671 	struct {
672 		struct tcphdr th;
673 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
674 	} rep;
675 	struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 	struct tcp_md5sig_key *key = NULL;
678 	const __u8 *hash_location = NULL;
679 	unsigned char newhash[16];
680 	int genhash;
681 	struct sock *sk1 = NULL;
682 #endif
683 	u64 transmit_time = 0;
684 	struct sock *ctl_sk;
685 	struct net *net;
686 
687 	/* Never send a reset in response to a reset. */
688 	if (th->rst)
689 		return;
690 
691 	/* If sk not NULL, it means we did a successful lookup and incoming
692 	 * route had to be correct. prequeue might have dropped our dst.
693 	 */
694 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 		return;
696 
697 	/* Swap the send and the receive. */
698 	memset(&rep, 0, sizeof(rep));
699 	rep.th.dest   = th->source;
700 	rep.th.source = th->dest;
701 	rep.th.doff   = sizeof(struct tcphdr) / 4;
702 	rep.th.rst    = 1;
703 
704 	if (th->ack) {
705 		rep.th.seq = th->ack_seq;
706 	} else {
707 		rep.th.ack = 1;
708 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 				       skb->len - (th->doff << 2));
710 	}
711 
712 	memset(&arg, 0, sizeof(arg));
713 	arg.iov[0].iov_base = (unsigned char *)&rep;
714 	arg.iov[0].iov_len  = sizeof(rep.th);
715 
716 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718 	rcu_read_lock();
719 	hash_location = tcp_parse_md5sig_option(th);
720 	if (sk && sk_fullsock(sk)) {
721 		const union tcp_md5_addr *addr;
722 		int l3index;
723 
724 		/* sdif set, means packet ingressed via a device
725 		 * in an L3 domain and inet_iif is set to it.
726 		 */
727 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 	} else if (hash_location) {
731 		const union tcp_md5_addr *addr;
732 		int sdif = tcp_v4_sdif(skb);
733 		int dif = inet_iif(skb);
734 		int l3index;
735 
736 		/*
737 		 * active side is lost. Try to find listening socket through
738 		 * source port, and then find md5 key through listening socket.
739 		 * we are not loose security here:
740 		 * Incoming packet is checked with md5 hash with finding key,
741 		 * no RST generated if md5 hash doesn't match.
742 		 */
743 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 					     ip_hdr(skb)->saddr,
745 					     th->source, ip_hdr(skb)->daddr,
746 					     ntohs(th->source), dif, sdif);
747 		/* don't send rst if it can't find key */
748 		if (!sk1)
749 			goto out;
750 
751 		/* sdif set, means packet ingressed via a device
752 		 * in an L3 domain and dif is set to it.
753 		 */
754 		l3index = sdif ? dif : 0;
755 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 		if (!key)
758 			goto out;
759 
760 
761 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 			goto out;
764 
765 	}
766 
767 	if (key) {
768 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 				   (TCPOPT_NOP << 16) |
770 				   (TCPOPT_MD5SIG << 8) |
771 				   TCPOLEN_MD5SIG);
772 		/* Update length and the length the header thinks exists */
773 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 		rep.th.doff = arg.iov[0].iov_len / 4;
775 
776 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 				     key, ip_hdr(skb)->saddr,
778 				     ip_hdr(skb)->daddr, &rep.th);
779 	}
780 #endif
781 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 	if (rep.opt[0] == 0) {
783 		__be32 mrst = mptcp_reset_option(skb);
784 
785 		if (mrst) {
786 			rep.opt[0] = mrst;
787 			arg.iov[0].iov_len += sizeof(mrst);
788 			rep.th.doff = arg.iov[0].iov_len / 4;
789 		}
790 	}
791 
792 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 				      ip_hdr(skb)->saddr, /* XXX */
794 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797 
798 	/* When socket is gone, all binding information is lost.
799 	 * routing might fail in this case. No choice here, if we choose to force
800 	 * input interface, we will misroute in case of asymmetric route.
801 	 */
802 	if (sk) {
803 		arg.bound_dev_if = sk->sk_bound_dev_if;
804 		if (sk_fullsock(sk))
805 			trace_tcp_send_reset(sk, skb);
806 	}
807 
808 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810 
811 	arg.tos = ip_hdr(skb)->tos;
812 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 	local_bh_disable();
814 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 	sock_net_set(ctl_sk, net);
816 	if (sk) {
817 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
819 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
821 		transmit_time = tcp_transmit_time(sk);
822 		xfrm_sk_clone_policy(ctl_sk, sk);
823 	}
824 	ip_send_unicast_reply(ctl_sk,
825 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 			      &arg, arg.iov[0].iov_len,
828 			      transmit_time);
829 
830 	ctl_sk->sk_mark = 0;
831 	xfrm_sk_free_policy(ctl_sk);
832 	sock_net_set(ctl_sk, &init_net);
833 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
834 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
835 	local_bh_enable();
836 
837 #ifdef CONFIG_TCP_MD5SIG
838 out:
839 	rcu_read_unlock();
840 #endif
841 }
842 
843 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
844    outside socket context is ugly, certainly. What can I do?
845  */
846 
847 static void tcp_v4_send_ack(const struct sock *sk,
848 			    struct sk_buff *skb, u32 seq, u32 ack,
849 			    u32 win, u32 tsval, u32 tsecr, int oif,
850 			    struct tcp_md5sig_key *key,
851 			    int reply_flags, u8 tos)
852 {
853 	const struct tcphdr *th = tcp_hdr(skb);
854 	struct {
855 		struct tcphdr th;
856 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
857 #ifdef CONFIG_TCP_MD5SIG
858 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
859 #endif
860 			];
861 	} rep;
862 	struct net *net = sock_net(sk);
863 	struct ip_reply_arg arg;
864 	struct sock *ctl_sk;
865 	u64 transmit_time;
866 
867 	memset(&rep.th, 0, sizeof(struct tcphdr));
868 	memset(&arg, 0, sizeof(arg));
869 
870 	arg.iov[0].iov_base = (unsigned char *)&rep;
871 	arg.iov[0].iov_len  = sizeof(rep.th);
872 	if (tsecr) {
873 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
874 				   (TCPOPT_TIMESTAMP << 8) |
875 				   TCPOLEN_TIMESTAMP);
876 		rep.opt[1] = htonl(tsval);
877 		rep.opt[2] = htonl(tsecr);
878 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
879 	}
880 
881 	/* Swap the send and the receive. */
882 	rep.th.dest    = th->source;
883 	rep.th.source  = th->dest;
884 	rep.th.doff    = arg.iov[0].iov_len / 4;
885 	rep.th.seq     = htonl(seq);
886 	rep.th.ack_seq = htonl(ack);
887 	rep.th.ack     = 1;
888 	rep.th.window  = htons(win);
889 
890 #ifdef CONFIG_TCP_MD5SIG
891 	if (key) {
892 		int offset = (tsecr) ? 3 : 0;
893 
894 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
895 					  (TCPOPT_NOP << 16) |
896 					  (TCPOPT_MD5SIG << 8) |
897 					  TCPOLEN_MD5SIG);
898 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
899 		rep.th.doff = arg.iov[0].iov_len/4;
900 
901 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
902 				    key, ip_hdr(skb)->saddr,
903 				    ip_hdr(skb)->daddr, &rep.th);
904 	}
905 #endif
906 	arg.flags = reply_flags;
907 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
908 				      ip_hdr(skb)->saddr, /* XXX */
909 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
910 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
911 	if (oif)
912 		arg.bound_dev_if = oif;
913 	arg.tos = tos;
914 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
915 	local_bh_disable();
916 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
917 	sock_net_set(ctl_sk, net);
918 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
919 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
920 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
921 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
922 	transmit_time = tcp_transmit_time(sk);
923 	ip_send_unicast_reply(ctl_sk,
924 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
925 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
926 			      &arg, arg.iov[0].iov_len,
927 			      transmit_time);
928 
929 	ctl_sk->sk_mark = 0;
930 	sock_net_set(ctl_sk, &init_net);
931 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
932 	local_bh_enable();
933 }
934 
935 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
936 {
937 	struct inet_timewait_sock *tw = inet_twsk(sk);
938 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
939 
940 	tcp_v4_send_ack(sk, skb,
941 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
942 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
943 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
944 			tcptw->tw_ts_recent,
945 			tw->tw_bound_dev_if,
946 			tcp_twsk_md5_key(tcptw),
947 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
948 			tw->tw_tos
949 			);
950 
951 	inet_twsk_put(tw);
952 }
953 
954 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
955 				  struct request_sock *req)
956 {
957 	const union tcp_md5_addr *addr;
958 	int l3index;
959 
960 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
961 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
962 	 */
963 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
964 					     tcp_sk(sk)->snd_nxt;
965 
966 	/* RFC 7323 2.3
967 	 * The window field (SEG.WND) of every outgoing segment, with the
968 	 * exception of <SYN> segments, MUST be right-shifted by
969 	 * Rcv.Wind.Shift bits:
970 	 */
971 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
972 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
973 	tcp_v4_send_ack(sk, skb, seq,
974 			tcp_rsk(req)->rcv_nxt,
975 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
976 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
977 			req->ts_recent,
978 			0,
979 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
980 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
981 			ip_hdr(skb)->tos);
982 }
983 
984 /*
985  *	Send a SYN-ACK after having received a SYN.
986  *	This still operates on a request_sock only, not on a big
987  *	socket.
988  */
989 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
990 			      struct flowi *fl,
991 			      struct request_sock *req,
992 			      struct tcp_fastopen_cookie *foc,
993 			      enum tcp_synack_type synack_type,
994 			      struct sk_buff *syn_skb)
995 {
996 	const struct inet_request_sock *ireq = inet_rsk(req);
997 	struct flowi4 fl4;
998 	int err = -1;
999 	struct sk_buff *skb;
1000 	u8 tos;
1001 
1002 	/* First, grab a route. */
1003 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004 		return -1;
1005 
1006 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007 
1008 	if (skb) {
1009 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010 
1011 		tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1014 				inet_sk(sk)->tos;
1015 
1016 		if (!INET_ECN_is_capable(tos) &&
1017 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1018 			tos |= INET_ECN_ECT_0;
1019 
1020 		rcu_read_lock();
1021 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 					    ireq->ir_rmt_addr,
1023 					    rcu_dereference(ireq->ireq_opt),
1024 					    tos);
1025 		rcu_read_unlock();
1026 		err = net_xmit_eval(err);
1027 	}
1028 
1029 	return err;
1030 }
1031 
1032 /*
1033  *	IPv4 request_sock destructor.
1034  */
1035 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036 {
1037 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038 }
1039 
1040 #ifdef CONFIG_TCP_MD5SIG
1041 /*
1042  * RFC2385 MD5 checksumming requires a mapping of
1043  * IP address->MD5 Key.
1044  * We need to maintain these in the sk structure.
1045  */
1046 
/* Static key defined in its false state and exported for use outside this
 * file. NOTE(review): presumably switched on once an MD5 key is configured;
 * the enabling site is not part of this section -- confirm.
 */
1047 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048 EXPORT_SYMBOL(tcp_md5_needed);
1049 
1050 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051 {
1052 	if (!old)
1053 		return true;
1054 
1055 	/* l3index always overrides non-l3index */
1056 	if (old->l3index && new->l3index == 0)
1057 		return false;
1058 	if (old->l3index == 0 && new->l3index)
1059 		return true;
1060 
1061 	return old->prefixlen < new->prefixlen;
1062 }
1063 
1064 /* Find the Key structure for an address.  */
1065 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066 					   const union tcp_md5_addr *addr,
1067 					   int family)
1068 {
1069 	const struct tcp_sock *tp = tcp_sk(sk);
1070 	struct tcp_md5sig_key *key;
1071 	const struct tcp_md5sig_info *md5sig;
1072 	__be32 mask;
1073 	struct tcp_md5sig_key *best_match = NULL;
1074 	bool match;
1075 
1076 	/* caller either holds rcu_read_lock() or socket lock */
1077 	md5sig = rcu_dereference_check(tp->md5sig_info,
1078 				       lockdep_sock_is_held(sk));
1079 	if (!md5sig)
1080 		return NULL;
1081 
1082 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083 				 lockdep_sock_is_held(sk)) {
1084 		if (key->family != family)
1085 			continue;
1086 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 			continue;
1088 		if (family == AF_INET) {
1089 			mask = inet_make_mask(key->prefixlen);
1090 			match = (key->addr.a4.s_addr & mask) ==
1091 				(addr->a4.s_addr & mask);
1092 #if IS_ENABLED(CONFIG_IPV6)
1093 		} else if (family == AF_INET6) {
1094 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095 						  key->prefixlen);
1096 #endif
1097 		} else {
1098 			match = false;
1099 		}
1100 
1101 		if (match && better_md5_match(best_match, key))
1102 			best_match = key;
1103 	}
1104 	return best_match;
1105 }
1106 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1107 
1108 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109 						      const union tcp_md5_addr *addr,
1110 						      int family, u8 prefixlen,
1111 						      int l3index, u8 flags)
1112 {
1113 	const struct tcp_sock *tp = tcp_sk(sk);
1114 	struct tcp_md5sig_key *key;
1115 	unsigned int size = sizeof(struct in_addr);
1116 	const struct tcp_md5sig_info *md5sig;
1117 
1118 	/* caller either holds rcu_read_lock() or socket lock */
1119 	md5sig = rcu_dereference_check(tp->md5sig_info,
1120 				       lockdep_sock_is_held(sk));
1121 	if (!md5sig)
1122 		return NULL;
1123 #if IS_ENABLED(CONFIG_IPV6)
1124 	if (family == AF_INET6)
1125 		size = sizeof(struct in6_addr);
1126 #endif
1127 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128 				 lockdep_sock_is_held(sk)) {
1129 		if (key->family != family)
1130 			continue;
1131 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 			continue;
1133 		if (key->l3index != l3index)
1134 			continue;
1135 		if (!memcmp(&key->addr, addr, size) &&
1136 		    key->prefixlen == prefixlen)
1137 			return key;
1138 	}
1139 	return NULL;
1140 }
1141 
1142 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143 					 const struct sock *addr_sk)
1144 {
1145 	const union tcp_md5_addr *addr;
1146 	int l3index;
1147 
1148 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149 						 addr_sk->sk_bound_dev_if);
1150 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 }
1153 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154 
1155 /* This can be called on a newly created socket, from other files */
1156 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157 		   int family, u8 prefixlen, int l3index, u8 flags,
1158 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159 {
1160 	/* Add Key to the list */
1161 	struct tcp_md5sig_key *key;
1162 	struct tcp_sock *tp = tcp_sk(sk);
1163 	struct tcp_md5sig_info *md5sig;
1164 
1165 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 	if (key) {
1167 		/* Pre-existing entry - just update that one.
1168 		 * Note that the key might be used concurrently.
1169 		 * data_race() is telling kcsan that we do not care of
1170 		 * key mismatches, since changing MD5 key on live flows
1171 		 * can lead to packet drops.
1172 		 */
1173 		data_race(memcpy(key->key, newkey, newkeylen));
1174 
1175 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1176 		 * Also note that a reader could catch new key->keylen value
1177 		 * but old key->key[], this is the reason we use __GFP_ZERO
1178 		 * at sock_kmalloc() time below these lines.
1179 		 */
1180 		WRITE_ONCE(key->keylen, newkeylen);
1181 
1182 		return 0;
1183 	}
1184 
1185 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1186 					   lockdep_sock_is_held(sk));
1187 	if (!md5sig) {
1188 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1189 		if (!md5sig)
1190 			return -ENOMEM;
1191 
1192 		sk_gso_disable(sk);
1193 		INIT_HLIST_HEAD(&md5sig->head);
1194 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1195 	}
1196 
1197 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198 	if (!key)
1199 		return -ENOMEM;
1200 	if (!tcp_alloc_md5sig_pool()) {
1201 		sock_kfree_s(sk, key, sizeof(*key));
1202 		return -ENOMEM;
1203 	}
1204 
1205 	memcpy(key->key, newkey, newkeylen);
1206 	key->keylen = newkeylen;
1207 	key->family = family;
1208 	key->prefixlen = prefixlen;
1209 	key->l3index = l3index;
1210 	key->flags = flags;
1211 	memcpy(&key->addr, addr,
1212 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213 								 sizeof(struct in_addr));
1214 	hlist_add_head_rcu(&key->node, &md5sig->head);
1215 	return 0;
1216 }
1217 EXPORT_SYMBOL(tcp_md5_do_add);
1218 
1219 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220 		   u8 prefixlen, int l3index, u8 flags)
1221 {
1222 	struct tcp_md5sig_key *key;
1223 
1224 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225 	if (!key)
1226 		return -ENOENT;
1227 	hlist_del_rcu(&key->node);
1228 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 	kfree_rcu(key, rcu);
1230 	return 0;
1231 }
1232 EXPORT_SYMBOL(tcp_md5_do_del);
1233 
1234 static void tcp_clear_md5_list(struct sock *sk)
1235 {
1236 	struct tcp_sock *tp = tcp_sk(sk);
1237 	struct tcp_md5sig_key *key;
1238 	struct hlist_node *n;
1239 	struct tcp_md5sig_info *md5sig;
1240 
1241 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242 
1243 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244 		hlist_del_rcu(&key->node);
1245 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246 		kfree_rcu(key, rcu);
1247 	}
1248 }
1249 
1250 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251 				 sockptr_t optval, int optlen)
1252 {
1253 	struct tcp_md5sig cmd;
1254 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255 	const union tcp_md5_addr *addr;
1256 	u8 prefixlen = 32;
1257 	int l3index = 0;
1258 	u8 flags;
1259 
1260 	if (optlen < sizeof(cmd))
1261 		return -EINVAL;
1262 
1263 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264 		return -EFAULT;
1265 
1266 	if (sin->sin_family != AF_INET)
1267 		return -EINVAL;
1268 
1269 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270 
1271 	if (optname == TCP_MD5SIG_EXT &&
1272 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273 		prefixlen = cmd.tcpm_prefixlen;
1274 		if (prefixlen > 32)
1275 			return -EINVAL;
1276 	}
1277 
1278 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280 		struct net_device *dev;
1281 
1282 		rcu_read_lock();
1283 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284 		if (dev && netif_is_l3_master(dev))
1285 			l3index = dev->ifindex;
1286 
1287 		rcu_read_unlock();
1288 
1289 		/* ok to reference set/not set outside of rcu;
1290 		 * right now device MUST be an L3 master
1291 		 */
1292 		if (!dev || !l3index)
1293 			return -EINVAL;
1294 	}
1295 
1296 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297 
1298 	if (!cmd.tcpm_keylen)
1299 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300 
1301 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302 		return -EINVAL;
1303 
1304 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306 }
1307 
1308 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309 				   __be32 daddr, __be32 saddr,
1310 				   const struct tcphdr *th, int nbytes)
1311 {
1312 	struct tcp4_pseudohdr *bp;
1313 	struct scatterlist sg;
1314 	struct tcphdr *_th;
1315 
1316 	bp = hp->scratch;
1317 	bp->saddr = saddr;
1318 	bp->daddr = daddr;
1319 	bp->pad = 0;
1320 	bp->protocol = IPPROTO_TCP;
1321 	bp->len = cpu_to_be16(nbytes);
1322 
1323 	_th = (struct tcphdr *)(bp + 1);
1324 	memcpy(_th, th, sizeof(*th));
1325 	_th->check = 0;
1326 
1327 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329 				sizeof(*bp) + sizeof(*th));
1330 	return crypto_ahash_update(hp->md5_req);
1331 }
1332 
1333 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335 {
1336 	struct tcp_md5sig_pool *hp;
1337 	struct ahash_request *req;
1338 
1339 	hp = tcp_get_md5sig_pool();
1340 	if (!hp)
1341 		goto clear_hash_noput;
1342 	req = hp->md5_req;
1343 
1344 	if (crypto_ahash_init(req))
1345 		goto clear_hash;
1346 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 		goto clear_hash;
1348 	if (tcp_md5_hash_key(hp, key))
1349 		goto clear_hash;
1350 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351 	if (crypto_ahash_final(req))
1352 		goto clear_hash;
1353 
1354 	tcp_put_md5sig_pool();
1355 	return 0;
1356 
1357 clear_hash:
1358 	tcp_put_md5sig_pool();
1359 clear_hash_noput:
1360 	memset(md5_hash, 0, 16);
1361 	return 1;
1362 }
1363 
1364 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365 			const struct sock *sk,
1366 			const struct sk_buff *skb)
1367 {
1368 	struct tcp_md5sig_pool *hp;
1369 	struct ahash_request *req;
1370 	const struct tcphdr *th = tcp_hdr(skb);
1371 	__be32 saddr, daddr;
1372 
1373 	if (sk) { /* valid for establish/request sockets */
1374 		saddr = sk->sk_rcv_saddr;
1375 		daddr = sk->sk_daddr;
1376 	} else {
1377 		const struct iphdr *iph = ip_hdr(skb);
1378 		saddr = iph->saddr;
1379 		daddr = iph->daddr;
1380 	}
1381 
1382 	hp = tcp_get_md5sig_pool();
1383 	if (!hp)
1384 		goto clear_hash_noput;
1385 	req = hp->md5_req;
1386 
1387 	if (crypto_ahash_init(req))
1388 		goto clear_hash;
1389 
1390 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 		goto clear_hash;
1392 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 		goto clear_hash;
1394 	if (tcp_md5_hash_key(hp, key))
1395 		goto clear_hash;
1396 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397 	if (crypto_ahash_final(req))
1398 		goto clear_hash;
1399 
1400 	tcp_put_md5sig_pool();
1401 	return 0;
1402 
1403 clear_hash:
1404 	tcp_put_md5sig_pool();
1405 clear_hash_noput:
1406 	memset(md5_hash, 0, 16);
1407 	return 1;
1408 }
1409 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410 
1411 #endif
1412 
1413 static void tcp_v4_init_req(struct request_sock *req,
1414 			    const struct sock *sk_listener,
1415 			    struct sk_buff *skb)
1416 {
1417 	struct inet_request_sock *ireq = inet_rsk(req);
1418 	struct net *net = sock_net(sk_listener);
1419 
1420 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423 }
1424 
1425 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426 					  struct sk_buff *skb,
1427 					  struct flowi *fl,
1428 					  struct request_sock *req)
1429 {
1430 	tcp_v4_init_req(req, sk, skb);
1431 
1432 	if (security_inet_conn_request(sk, skb, req))
1433 		return NULL;
1434 
1435 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1436 }
1437 
1438 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 	.family		=	PF_INET,
1440 	.obj_size	=	sizeof(struct tcp_request_sock),
1441 	.rtx_syn_ack	=	tcp_rtx_synack,
1442 	.send_ack	=	tcp_v4_reqsk_send_ack,
1443 	.destructor	=	tcp_v4_reqsk_destructor,
1444 	.send_reset	=	tcp_v4_send_reset,
1445 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1446 };
1447 
1448 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449 	.mss_clamp	=	TCP_MSS_DEFAULT,
1450 #ifdef CONFIG_TCP_MD5SIG
1451 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1452 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1453 #endif
1454 #ifdef CONFIG_SYN_COOKIES
1455 	.cookie_init_seq =	cookie_v4_init_sequence,
1456 #endif
1457 	.route_req	=	tcp_v4_route_req,
1458 	.init_seq	=	tcp_v4_init_seq,
1459 	.init_ts_off	=	tcp_v4_init_ts_off,
1460 	.send_synack	=	tcp_v4_send_synack,
1461 };
1462 
1463 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464 {
1465 	/* Never answer to SYNs send to broadcast or multicast */
1466 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467 		goto drop;
1468 
1469 	return tcp_conn_request(&tcp_request_sock_ops,
1470 				&tcp_request_sock_ipv4_ops, sk, skb);
1471 
1472 drop:
1473 	tcp_listendrop(sk);
1474 	return 0;
1475 }
1476 EXPORT_SYMBOL(tcp_v4_conn_request);
1477 
1478 
1479 /*
1480  * The three way handshake has completed - we got a valid synack -
1481  * now create the new socket.
1482  */
1483 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484 				  struct request_sock *req,
1485 				  struct dst_entry *dst,
1486 				  struct request_sock *req_unhash,
1487 				  bool *own_req)
1488 {
1489 	struct inet_request_sock *ireq;
1490 	bool found_dup_sk = false;
1491 	struct inet_sock *newinet;
1492 	struct tcp_sock *newtp;
1493 	struct sock *newsk;
1494 #ifdef CONFIG_TCP_MD5SIG
1495 	const union tcp_md5_addr *addr;
1496 	struct tcp_md5sig_key *key;
1497 	int l3index;
1498 #endif
1499 	struct ip_options_rcu *inet_opt;
1500 
1501 	if (sk_acceptq_is_full(sk))
1502 		goto exit_overflow;
1503 
1504 	newsk = tcp_create_openreq_child(sk, req, skb);
1505 	if (!newsk)
1506 		goto exit_nonewsk;
1507 
1508 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1509 	inet_sk_rx_dst_set(newsk, skb);
1510 
1511 	newtp		      = tcp_sk(newsk);
1512 	newinet		      = inet_sk(newsk);
1513 	ireq		      = inet_rsk(req);
1514 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516 	newsk->sk_bound_dev_if = ireq->ir_iif;
1517 	newinet->inet_saddr   = ireq->ir_loc_addr;
1518 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1519 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520 	newinet->mc_index     = inet_iif(skb);
1521 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1522 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1523 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 	if (inet_opt)
1525 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526 	newinet->inet_id = prandom_u32();
1527 
1528 	/* Set ToS of the new socket based upon the value of incoming SYN.
1529 	 * ECT bits are set later in tcp_init_transfer().
1530 	 */
1531 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533 
1534 	if (!dst) {
1535 		dst = inet_csk_route_child_sock(sk, newsk, req);
1536 		if (!dst)
1537 			goto put_and_exit;
1538 	} else {
1539 		/* syncookie case : see end of cookie_v4_check() */
1540 	}
1541 	sk_setup_caps(newsk, dst);
1542 
1543 	tcp_ca_openreq_child(newsk, dst);
1544 
1545 	tcp_sync_mss(newsk, dst_mtu(dst));
1546 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547 
1548 	tcp_initialize_rcv_mss(newsk);
1549 
1550 #ifdef CONFIG_TCP_MD5SIG
1551 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552 	/* Copy over the MD5 key from the original socket */
1553 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555 	if (key) {
1556 		/*
1557 		 * We're using one, so create a matching key
1558 		 * on the newsk structure. If we fail to get
1559 		 * memory, then we end up not copying the key
1560 		 * across. Shucks.
1561 		 */
1562 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563 			       key->key, key->keylen, GFP_ATOMIC);
1564 		sk_gso_disable(newsk);
1565 	}
1566 #endif
1567 
1568 	if (__inet_inherit_port(sk, newsk) < 0)
1569 		goto put_and_exit;
1570 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 				       &found_dup_sk);
1572 	if (likely(*own_req)) {
1573 		tcp_move_syn(newtp, req);
1574 		ireq->ireq_opt = NULL;
1575 	} else {
1576 		newinet->inet_opt = NULL;
1577 
1578 		if (!req_unhash && found_dup_sk) {
1579 			/* This code path should only be executed in the
1580 			 * syncookie case only
1581 			 */
1582 			bh_unlock_sock(newsk);
1583 			sock_put(newsk);
1584 			newsk = NULL;
1585 		}
1586 	}
1587 	return newsk;
1588 
1589 exit_overflow:
1590 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591 exit_nonewsk:
1592 	dst_release(dst);
1593 exit:
1594 	tcp_listendrop(sk);
1595 	return NULL;
1596 put_and_exit:
1597 	newinet->inet_opt = NULL;
1598 	inet_csk_prepare_forced_close(newsk);
1599 	tcp_done(newsk);
1600 	goto exit;
1601 }
1602 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603 
/* For a segment without SYN, let cookie_v4_check() decide which socket
 * should handle it; a SYN (or a build without CONFIG_SYN_COOKIES) simply
 * gets @sk back.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1614 
1615 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616 			 struct tcphdr *th, u32 *cookie)
1617 {
1618 	u16 mss = 0;
1619 #ifdef CONFIG_SYN_COOKIES
1620 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621 				    &tcp_request_sock_ipv4_ops, sk, th);
1622 	if (mss) {
1623 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624 		tcp_synq_overflow(sk);
1625 	}
1626 #endif
1627 	return mss;
1628 }
1629 
/* Declaration of ipv4_dst_check(), which tcp_v4_do_rcv() names as the
 * expected target of its INDIRECT_CALL_1() on dst->ops->check.
 */
1630 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 							   u32));
1632 /* The socket must have it's spinlock held when we get
1633  * here, unless it is a TCP_LISTEN socket.
1634  *
1635  * We have a potential double-lock case here, so even when
1636  * doing backlog processing we use the BH locking scheme.
1637  * This is because we cannot sleep with the original spinlock
1638  * held.
1639  */
1640 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641 {
1642 	enum skb_drop_reason reason;
1643 	struct sock *rsk;
1644 
1645 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1646 		struct dst_entry *dst;
1647 
1648 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1649 						lockdep_sock_is_held(sk));
1650 
1651 		sock_rps_save_rxhash(sk, skb);
1652 		sk_mark_napi_id(sk, skb);
1653 		if (dst) {
1654 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 					     dst, 0)) {
1657 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658 				dst_release(dst);
1659 			}
1660 		}
1661 		tcp_rcv_established(sk, skb);
1662 		return 0;
1663 	}
1664 
1665 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666 	if (tcp_checksum_complete(skb))
1667 		goto csum_err;
1668 
1669 	if (sk->sk_state == TCP_LISTEN) {
1670 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671 
1672 		if (!nsk)
1673 			goto discard;
1674 		if (nsk != sk) {
1675 			if (tcp_child_process(sk, nsk, skb)) {
1676 				rsk = nsk;
1677 				goto reset;
1678 			}
1679 			return 0;
1680 		}
1681 	} else
1682 		sock_rps_save_rxhash(sk, skb);
1683 
1684 	if (tcp_rcv_state_process(sk, skb)) {
1685 		rsk = sk;
1686 		goto reset;
1687 	}
1688 	return 0;
1689 
1690 reset:
1691 	tcp_v4_send_reset(rsk, skb);
1692 discard:
1693 	kfree_skb_reason(skb, reason);
1694 	/* Be careful here. If this function gets more complicated and
1695 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1696 	 * might be destroyed here. This current version compiles correctly,
1697 	 * but you have been warned.
1698 	 */
1699 	return 0;
1700 
1701 csum_err:
1702 	reason = SKB_DROP_REASON_TCP_CSUM;
1703 	trace_tcp_bad_csum(skb);
1704 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706 	goto discard;
1707 }
1708 EXPORT_SYMBOL(tcp_v4_do_rcv);
1709 
1710 int tcp_v4_early_demux(struct sk_buff *skb)
1711 {
1712 	const struct iphdr *iph;
1713 	const struct tcphdr *th;
1714 	struct sock *sk;
1715 
1716 	if (skb->pkt_type != PACKET_HOST)
1717 		return 0;
1718 
1719 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720 		return 0;
1721 
1722 	iph = ip_hdr(skb);
1723 	th = tcp_hdr(skb);
1724 
1725 	if (th->doff < sizeof(struct tcphdr) / 4)
1726 		return 0;
1727 
1728 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729 				       iph->saddr, th->source,
1730 				       iph->daddr, ntohs(th->dest),
1731 				       skb->skb_iif, inet_sdif(skb));
1732 	if (sk) {
1733 		skb->sk = sk;
1734 		skb->destructor = sock_edemux;
1735 		if (sk_fullsock(sk)) {
1736 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737 
1738 			if (dst)
1739 				dst = dst_check(dst, 0);
1740 			if (dst &&
1741 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1742 				skb_dst_set_noref(skb, dst);
1743 		}
1744 	}
1745 	return 0;
1746 }
1747 
1748 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749 		     enum skb_drop_reason *reason)
1750 {
1751 	u32 limit, tail_gso_size, tail_gso_segs;
1752 	struct skb_shared_info *shinfo;
1753 	const struct tcphdr *th;
1754 	struct tcphdr *thtail;
1755 	struct sk_buff *tail;
1756 	unsigned int hdrlen;
1757 	bool fragstolen;
1758 	u32 gso_segs;
1759 	u32 gso_size;
1760 	int delta;
1761 
1762 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1763 	 * we can fix skb->truesize to its real value to avoid future drops.
1764 	 * This is valid because skb is not yet charged to the socket.
1765 	 * It has been noticed pure SACK packets were sometimes dropped
1766 	 * (if cooked by drivers without copybreak feature).
1767 	 */
1768 	skb_condense(skb);
1769 
1770 	skb_dst_drop(skb);
1771 
1772 	if (unlikely(tcp_checksum_complete(skb))) {
1773 		bh_unlock_sock(sk);
1774 		trace_tcp_bad_csum(skb);
1775 		*reason = SKB_DROP_REASON_TCP_CSUM;
1776 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778 		return true;
1779 	}
1780 
1781 	/* Attempt coalescing to last skb in backlog, even if we are
1782 	 * above the limits.
1783 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784 	 */
1785 	th = (const struct tcphdr *)skb->data;
1786 	hdrlen = th->doff * 4;
1787 
1788 	tail = sk->sk_backlog.tail;
1789 	if (!tail)
1790 		goto no_coalesce;
1791 	thtail = (struct tcphdr *)tail->data;
1792 
1793 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795 	    ((TCP_SKB_CB(tail)->tcp_flags |
1796 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797 	    !((TCP_SKB_CB(tail)->tcp_flags &
1798 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1800 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801 #ifdef CONFIG_TLS_DEVICE
1802 	    tail->decrypted != skb->decrypted ||
1803 #endif
1804 	    thtail->doff != th->doff ||
1805 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806 		goto no_coalesce;
1807 
1808 	__skb_pull(skb, hdrlen);
1809 
1810 	shinfo = skb_shinfo(skb);
1811 	gso_size = shinfo->gso_size ?: skb->len;
1812 	gso_segs = shinfo->gso_segs ?: 1;
1813 
1814 	shinfo = skb_shinfo(tail);
1815 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816 	tail_gso_segs = shinfo->gso_segs ?: 1;
1817 
1818 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820 
1821 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823 			thtail->window = th->window;
1824 		}
1825 
1826 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1827 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1828 		 * is not entered if we append a packet with a FIN.
1829 		 * SYN, RST, URG are not present.
1830 		 * ACK is set on both packets.
1831 		 * PSH : we do not really care in TCP stack,
1832 		 *       at least for 'GRO' packets.
1833 		 */
1834 		thtail->fin |= th->fin;
1835 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836 
1837 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1839 			tail->tstamp = skb->tstamp;
1840 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 		}
1842 
1843 		/* Not as strict as GRO. We only need to carry mss max value */
1844 		shinfo->gso_size = max(gso_size, tail_gso_size);
1845 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846 
1847 		sk->sk_backlog.len += delta;
1848 		__NET_INC_STATS(sock_net(sk),
1849 				LINUX_MIB_TCPBACKLOGCOALESCE);
1850 		kfree_skb_partial(skb, fragstolen);
1851 		return false;
1852 	}
1853 	__skb_push(skb, hdrlen);
1854 
1855 no_coalesce:
1856 	/* Only socket owner can try to collapse/prune rx queues
1857 	 * to reduce memory overhead, so add a little headroom here.
1858 	 * Few sockets backlog are possibly concurrently non empty.
1859 	 */
1860 	limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861 
1862 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863 		bh_unlock_sock(sk);
1864 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1865 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866 		return true;
1867 	}
1868 	return false;
1869 }
1870 EXPORT_SYMBOL(tcp_add_backlog);
1871 
1872 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873 {
1874 	struct tcphdr *th = (struct tcphdr *)skb->data;
1875 
1876 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877 }
1878 EXPORT_SYMBOL(tcp_filter);
1879 
1880 static void tcp_v4_restore_cb(struct sk_buff *skb)
1881 {
1882 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883 		sizeof(struct inet_skb_parm));
1884 }
1885 
1886 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887 			   const struct tcphdr *th)
1888 {
1889 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1890 	 * barrier() makes sure compiler wont play fool^Waliasing games.
1891 	 */
1892 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893 		sizeof(struct inet_skb_parm));
1894 	barrier();
1895 
1896 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 				    skb->len - th->doff * 4);
1899 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903 	TCP_SKB_CB(skb)->sacked	 = 0;
1904 	TCP_SKB_CB(skb)->has_rxtstamp =
1905 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906 }
1907 
1908 /*
1909  *	From tcp_input.c
1910  */
1911 
1912 int tcp_v4_rcv(struct sk_buff *skb)
1913 {
1914 	struct net *net = dev_net(skb->dev);
1915 	enum skb_drop_reason drop_reason;
1916 	int sdif = inet_sdif(skb);
1917 	int dif = inet_iif(skb);
1918 	const struct iphdr *iph;
1919 	const struct tcphdr *th;
1920 	bool refcounted;
1921 	struct sock *sk;
1922 	int ret;
1923 
1924 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1925 	if (skb->pkt_type != PACKET_HOST)
1926 		goto discard_it;
1927 
1928 	/* Count it even if it's bad */
1929 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930 
1931 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1932 		goto discard_it;
1933 
1934 	th = (const struct tcphdr *)skb->data;
1935 
1936 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1937 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1938 		goto bad_packet;
1939 	}
1940 	if (!pskb_may_pull(skb, th->doff * 4))
1941 		goto discard_it;
1942 
1943 	/* An explanation is required here, I think.
1944 	 * Packet length and doff are validated by header prediction,
1945 	 * provided case of th->doff==0 is eliminated.
1946 	 * So, we defer the checks. */
1947 
1948 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949 		goto csum_error;
1950 
1951 	th = (const struct tcphdr *)skb->data;
1952 	iph = ip_hdr(skb);
1953 lookup:
1954 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1955 			       th->dest, sdif, &refcounted);
1956 	if (!sk)
1957 		goto no_tcp_socket;
1958 
1959 process:
1960 	if (sk->sk_state == TCP_TIME_WAIT)
1961 		goto do_time_wait;
1962 
1963 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1964 		struct request_sock *req = inet_reqsk(sk);
1965 		bool req_stolen = false;
1966 		struct sock *nsk;
1967 
1968 		sk = req->rsk_listener;
1969 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1970 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1971 		else
1972 			drop_reason = tcp_inbound_md5_hash(sk, skb,
1973 						   &iph->saddr, &iph->daddr,
1974 						   AF_INET, dif, sdif);
1975 		if (unlikely(drop_reason)) {
1976 			sk_drops_add(sk, skb);
1977 			reqsk_put(req);
1978 			goto discard_it;
1979 		}
1980 		if (tcp_checksum_complete(skb)) {
1981 			reqsk_put(req);
1982 			goto csum_error;
1983 		}
1984 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1985 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1986 			if (!nsk) {
1987 				inet_csk_reqsk_queue_drop_and_put(sk, req);
1988 				goto lookup;
1989 			}
1990 			sk = nsk;
1991 			/* reuseport_migrate_sock() has already held one sk_refcnt
1992 			 * before returning.
1993 			 */
1994 		} else {
1995 			/* We own a reference on the listener, increase it again
1996 			 * as we might lose it too soon.
1997 			 */
1998 			sock_hold(sk);
1999 		}
2000 		refcounted = true;
2001 		nsk = NULL;
2002 		if (!tcp_filter(sk, skb)) {
2003 			th = (const struct tcphdr *)skb->data;
2004 			iph = ip_hdr(skb);
2005 			tcp_v4_fill_cb(skb, iph, th);
2006 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007 		} else {
2008 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009 		}
2010 		if (!nsk) {
2011 			reqsk_put(req);
2012 			if (req_stolen) {
2013 				/* Another cpu got exclusive access to req
2014 				 * and created a full blown socket.
2015 				 * Try to feed this packet to this socket
2016 				 * instead of discarding it.
2017 				 */
2018 				tcp_v4_restore_cb(skb);
2019 				sock_put(sk);
2020 				goto lookup;
2021 			}
2022 			goto discard_and_relse;
2023 		}
2024 		nf_reset_ct(skb);
2025 		if (nsk == sk) {
2026 			reqsk_put(req);
2027 			tcp_v4_restore_cb(skb);
2028 		} else if (tcp_child_process(sk, nsk, skb)) {
2029 			tcp_v4_send_reset(nsk, skb);
2030 			goto discard_and_relse;
2031 		} else {
2032 			sock_put(sk);
2033 			return 0;
2034 		}
2035 	}
2036 
2037 	if (static_branch_unlikely(&ip4_min_ttl)) {
2038 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2039 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2040 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2041 			goto discard_and_relse;
2042 		}
2043 	}
2044 
2045 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2046 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2047 		goto discard_and_relse;
2048 	}
2049 
2050 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2051 					   &iph->daddr, AF_INET, dif, sdif);
2052 	if (drop_reason)
2053 		goto discard_and_relse;
2054 
2055 	nf_reset_ct(skb);
2056 
2057 	if (tcp_filter(sk, skb)) {
2058 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2059 		goto discard_and_relse;
2060 	}
2061 	th = (const struct tcphdr *)skb->data;
2062 	iph = ip_hdr(skb);
2063 	tcp_v4_fill_cb(skb, iph, th);
2064 
2065 	skb->dev = NULL;
2066 
2067 	if (sk->sk_state == TCP_LISTEN) {
2068 		ret = tcp_v4_do_rcv(sk, skb);
2069 		goto put_and_return;
2070 	}
2071 
2072 	sk_incoming_cpu_update(sk);
2073 
2074 	bh_lock_sock_nested(sk);
2075 	tcp_segs_in(tcp_sk(sk), skb);
2076 	ret = 0;
2077 	if (!sock_owned_by_user(sk)) {
2078 		ret = tcp_v4_do_rcv(sk, skb);
2079 	} else {
2080 		if (tcp_add_backlog(sk, skb, &drop_reason))
2081 			goto discard_and_relse;
2082 	}
2083 	bh_unlock_sock(sk);
2084 
2085 put_and_return:
2086 	if (refcounted)
2087 		sock_put(sk);
2088 
2089 	return ret;
2090 
2091 no_tcp_socket:
2092 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2093 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2094 		goto discard_it;
2095 
2096 	tcp_v4_fill_cb(skb, iph, th);
2097 
2098 	if (tcp_checksum_complete(skb)) {
2099 csum_error:
2100 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2101 		trace_tcp_bad_csum(skb);
2102 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2103 bad_packet:
2104 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2105 	} else {
2106 		tcp_v4_send_reset(NULL, skb);
2107 	}
2108 
2109 discard_it:
2110 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2111 	/* Discard frame. */
2112 	kfree_skb_reason(skb, drop_reason);
2113 	return 0;
2114 
2115 discard_and_relse:
2116 	sk_drops_add(sk, skb);
2117 	if (refcounted)
2118 		sock_put(sk);
2119 	goto discard_it;
2120 
2121 do_time_wait:
2122 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2123 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2124 		inet_twsk_put(inet_twsk(sk));
2125 		goto discard_it;
2126 	}
2127 
2128 	tcp_v4_fill_cb(skb, iph, th);
2129 
2130 	if (tcp_checksum_complete(skb)) {
2131 		inet_twsk_put(inet_twsk(sk));
2132 		goto csum_error;
2133 	}
2134 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2135 	case TCP_TW_SYN: {
2136 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2137 							&tcp_hashinfo, skb,
2138 							__tcp_hdrlen(th),
2139 							iph->saddr, th->source,
2140 							iph->daddr, th->dest,
2141 							inet_iif(skb),
2142 							sdif);
2143 		if (sk2) {
2144 			inet_twsk_deschedule_put(inet_twsk(sk));
2145 			sk = sk2;
2146 			tcp_v4_restore_cb(skb);
2147 			refcounted = false;
2148 			goto process;
2149 		}
2150 	}
2151 		/* to ACK */
2152 		fallthrough;
2153 	case TCP_TW_ACK:
2154 		tcp_v4_timewait_ack(sk, skb);
2155 		break;
2156 	case TCP_TW_RST:
2157 		tcp_v4_send_reset(sk, skb);
2158 		inet_twsk_deschedule_put(inet_twsk(sk));
2159 		goto discard_it;
2160 	case TCP_TW_SUCCESS:;
2161 	}
2162 	goto discard_it;
2163 }
2164 
2165 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2166 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2167 	.twsk_unique	= tcp_twsk_unique,
2168 	.twsk_destructor= tcp_twsk_destructor,
2169 };
2170 
2171 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2172 {
2173 	struct dst_entry *dst = skb_dst(skb);
2174 
2175 	if (dst && dst_hold_safe(dst)) {
2176 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2177 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2178 	}
2179 }
2180 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2181 
2182 const struct inet_connection_sock_af_ops ipv4_specific = {
2183 	.queue_xmit	   = ip_queue_xmit,
2184 	.send_check	   = tcp_v4_send_check,
2185 	.rebuild_header	   = inet_sk_rebuild_header,
2186 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2187 	.conn_request	   = tcp_v4_conn_request,
2188 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2189 	.net_header_len	   = sizeof(struct iphdr),
2190 	.setsockopt	   = ip_setsockopt,
2191 	.getsockopt	   = ip_getsockopt,
2192 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2193 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2194 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2195 };
2196 EXPORT_SYMBOL(ipv4_specific);
2197 
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 TCP-MD5 helpers; tcp_v4_init_sock() stores this as tp->af_specific. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.md5_parse		= tcp_v4_parse_md5_keys,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
};
#endif
2205 
2206 /* NOTE: A lot of things set to zero explicitly by call to
2207  *       sk_alloc() so need not be done here.
2208  */
2209 static int tcp_v4_init_sock(struct sock *sk)
2210 {
2211 	struct inet_connection_sock *icsk = inet_csk(sk);
2212 
2213 	tcp_init_sock(sk);
2214 
2215 	icsk->icsk_af_ops = &ipv4_specific;
2216 
2217 #ifdef CONFIG_TCP_MD5SIG
2218 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219 #endif
2220 
2221 	return 0;
2222 }
2223 
2224 void tcp_v4_destroy_sock(struct sock *sk)
2225 {
2226 	struct tcp_sock *tp = tcp_sk(sk);
2227 
2228 	trace_tcp_destroy_sock(sk);
2229 
2230 	tcp_clear_xmit_timers(sk);
2231 
2232 	tcp_cleanup_congestion_control(sk);
2233 
2234 	tcp_cleanup_ulp(sk);
2235 
2236 	/* Cleanup up the write buffer. */
2237 	tcp_write_queue_purge(sk);
2238 
2239 	/* Check if we want to disable active TFO */
2240 	tcp_fastopen_active_disable_ofo_check(sk);
2241 
2242 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2243 	skb_rbtree_purge(&tp->out_of_order_queue);
2244 
2245 #ifdef CONFIG_TCP_MD5SIG
2246 	/* Clean up the MD5 key list, if any */
2247 	if (tp->md5sig_info) {
2248 		tcp_clear_md5_list(sk);
2249 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2250 		tp->md5sig_info = NULL;
2251 	}
2252 #endif
2253 
2254 	/* Clean up a referenced TCP bind bucket. */
2255 	if (inet_csk(sk)->icsk_bind_hash)
2256 		inet_put_port(sk);
2257 
2258 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2259 
2260 	/* If socket is aborted during connect operation */
2261 	tcp_free_fastopen_req(tp);
2262 	tcp_fastopen_destroy_cipher(sk);
2263 	tcp_saved_syn_free(tp);
2264 
2265 	sk_sockets_allocated_dec(sk);
2266 }
2267 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2268 
2269 #ifdef CONFIG_PROC_FS
2270 /* Proc filesystem TCP sock list dumping. */
2271 
2272 static unsigned short seq_file_family(const struct seq_file *seq);
2273 
2274 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2275 {
2276 	unsigned short family = seq_file_family(seq);
2277 
2278 	/* AF_UNSPEC is used as a match all */
2279 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280 		net_eq(sock_net(sk), seq_file_net(seq)));
2281 }
2282 
2283 /* Find a non empty bucket (starting from st->bucket)
2284  * and return the first sk from it.
2285  */
2286 static void *listening_get_first(struct seq_file *seq)
2287 {
2288 	struct tcp_iter_state *st = seq->private;
2289 
2290 	st->offset = 0;
2291 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292 		struct inet_listen_hashbucket *ilb2;
2293 		struct hlist_nulls_node *node;
2294 		struct sock *sk;
2295 
2296 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297 		if (hlist_nulls_empty(&ilb2->nulls_head))
2298 			continue;
2299 
2300 		spin_lock(&ilb2->lock);
2301 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302 			if (seq_sk_match(seq, sk))
2303 				return sk;
2304 		}
2305 		spin_unlock(&ilb2->lock);
2306 	}
2307 
2308 	return NULL;
2309 }
2310 
2311 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2312  * If "cur" is the last one in the st->bucket,
2313  * call listening_get_first() to return the first sk of the next
2314  * non empty bucket.
2315  */
2316 static void *listening_get_next(struct seq_file *seq, void *cur)
2317 {
2318 	struct tcp_iter_state *st = seq->private;
2319 	struct inet_listen_hashbucket *ilb2;
2320 	struct hlist_nulls_node *node;
2321 	struct sock *sk = cur;
2322 
2323 	++st->num;
2324 	++st->offset;
2325 
2326 	sk = sk_nulls_next(sk);
2327 	sk_nulls_for_each_from(sk, node) {
2328 		if (seq_sk_match(seq, sk))
2329 			return sk;
2330 	}
2331 
2332 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333 	spin_unlock(&ilb2->lock);
2334 	++st->bucket;
2335 	return listening_get_first(seq);
2336 }
2337 
2338 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2339 {
2340 	struct tcp_iter_state *st = seq->private;
2341 	void *rc;
2342 
2343 	st->bucket = 0;
2344 	st->offset = 0;
2345 	rc = listening_get_first(seq);
2346 
2347 	while (rc && *pos) {
2348 		rc = listening_get_next(seq, rc);
2349 		--*pos;
2350 	}
2351 	return rc;
2352 }
2353 
2354 static inline bool empty_bucket(const struct tcp_iter_state *st)
2355 {
2356 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357 }
2358 
2359 /*
2360  * Get first established socket starting from bucket given in st->bucket.
2361  * If st->bucket is zero, the very first socket in the hash is returned.
2362  */
2363 static void *established_get_first(struct seq_file *seq)
2364 {
2365 	struct tcp_iter_state *st = seq->private;
2366 
2367 	st->offset = 0;
2368 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369 		struct sock *sk;
2370 		struct hlist_nulls_node *node;
2371 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372 
2373 		/* Lockless fast path for the common case of empty buckets */
2374 		if (empty_bucket(st))
2375 			continue;
2376 
2377 		spin_lock_bh(lock);
2378 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379 			if (seq_sk_match(seq, sk))
2380 				return sk;
2381 		}
2382 		spin_unlock_bh(lock);
2383 	}
2384 
2385 	return NULL;
2386 }
2387 
2388 static void *established_get_next(struct seq_file *seq, void *cur)
2389 {
2390 	struct sock *sk = cur;
2391 	struct hlist_nulls_node *node;
2392 	struct tcp_iter_state *st = seq->private;
2393 
2394 	++st->num;
2395 	++st->offset;
2396 
2397 	sk = sk_nulls_next(sk);
2398 
2399 	sk_nulls_for_each_from(sk, node) {
2400 		if (seq_sk_match(seq, sk))
2401 			return sk;
2402 	}
2403 
2404 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2405 	++st->bucket;
2406 	return established_get_first(seq);
2407 }
2408 
2409 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2410 {
2411 	struct tcp_iter_state *st = seq->private;
2412 	void *rc;
2413 
2414 	st->bucket = 0;
2415 	rc = established_get_first(seq);
2416 
2417 	while (rc && pos) {
2418 		rc = established_get_next(seq, rc);
2419 		--pos;
2420 	}
2421 	return rc;
2422 }
2423 
2424 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2425 {
2426 	void *rc;
2427 	struct tcp_iter_state *st = seq->private;
2428 
2429 	st->state = TCP_SEQ_STATE_LISTENING;
2430 	rc	  = listening_get_idx(seq, &pos);
2431 
2432 	if (!rc) {
2433 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2434 		rc	  = established_get_idx(seq, pos);
2435 	}
2436 
2437 	return rc;
2438 }
2439 
2440 static void *tcp_seek_last_pos(struct seq_file *seq)
2441 {
2442 	struct tcp_iter_state *st = seq->private;
2443 	int bucket = st->bucket;
2444 	int offset = st->offset;
2445 	int orig_num = st->num;
2446 	void *rc = NULL;
2447 
2448 	switch (st->state) {
2449 	case TCP_SEQ_STATE_LISTENING:
2450 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2451 			break;
2452 		st->state = TCP_SEQ_STATE_LISTENING;
2453 		rc = listening_get_first(seq);
2454 		while (offset-- && rc && bucket == st->bucket)
2455 			rc = listening_get_next(seq, rc);
2456 		if (rc)
2457 			break;
2458 		st->bucket = 0;
2459 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2460 		fallthrough;
2461 	case TCP_SEQ_STATE_ESTABLISHED:
2462 		if (st->bucket > tcp_hashinfo.ehash_mask)
2463 			break;
2464 		rc = established_get_first(seq);
2465 		while (offset-- && rc && bucket == st->bucket)
2466 			rc = established_get_next(seq, rc);
2467 	}
2468 
2469 	st->num = orig_num;
2470 
2471 	return rc;
2472 }
2473 
2474 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2475 {
2476 	struct tcp_iter_state *st = seq->private;
2477 	void *rc;
2478 
2479 	if (*pos && *pos == st->last_pos) {
2480 		rc = tcp_seek_last_pos(seq);
2481 		if (rc)
2482 			goto out;
2483 	}
2484 
2485 	st->state = TCP_SEQ_STATE_LISTENING;
2486 	st->num = 0;
2487 	st->bucket = 0;
2488 	st->offset = 0;
2489 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2490 
2491 out:
2492 	st->last_pos = *pos;
2493 	return rc;
2494 }
2495 EXPORT_SYMBOL(tcp_seq_start);
2496 
2497 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2498 {
2499 	struct tcp_iter_state *st = seq->private;
2500 	void *rc = NULL;
2501 
2502 	if (v == SEQ_START_TOKEN) {
2503 		rc = tcp_get_idx(seq, 0);
2504 		goto out;
2505 	}
2506 
2507 	switch (st->state) {
2508 	case TCP_SEQ_STATE_LISTENING:
2509 		rc = listening_get_next(seq, v);
2510 		if (!rc) {
2511 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2512 			st->bucket = 0;
2513 			st->offset = 0;
2514 			rc	  = established_get_first(seq);
2515 		}
2516 		break;
2517 	case TCP_SEQ_STATE_ESTABLISHED:
2518 		rc = established_get_next(seq, v);
2519 		break;
2520 	}
2521 out:
2522 	++*pos;
2523 	st->last_pos = *pos;
2524 	return rc;
2525 }
2526 EXPORT_SYMBOL(tcp_seq_next);
2527 
2528 void tcp_seq_stop(struct seq_file *seq, void *v)
2529 {
2530 	struct tcp_iter_state *st = seq->private;
2531 
2532 	switch (st->state) {
2533 	case TCP_SEQ_STATE_LISTENING:
2534 		if (v != SEQ_START_TOKEN)
2535 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2536 		break;
2537 	case TCP_SEQ_STATE_ESTABLISHED:
2538 		if (v)
2539 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540 		break;
2541 	}
2542 }
2543 EXPORT_SYMBOL(tcp_seq_stop);
2544 
2545 static void get_openreq4(const struct request_sock *req,
2546 			 struct seq_file *f, int i)
2547 {
2548 	const struct inet_request_sock *ireq = inet_rsk(req);
2549 	long delta = req->rsk_timer.expires - jiffies;
2550 
2551 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2552 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2553 		i,
2554 		ireq->ir_loc_addr,
2555 		ireq->ir_num,
2556 		ireq->ir_rmt_addr,
2557 		ntohs(ireq->ir_rmt_port),
2558 		TCP_SYN_RECV,
2559 		0, 0, /* could print option size, but that is af dependent. */
2560 		1,    /* timers active (only the expire timer) */
2561 		jiffies_delta_to_clock_t(delta),
2562 		req->num_timeout,
2563 		from_kuid_munged(seq_user_ns(f),
2564 				 sock_i_uid(req->rsk_listener)),
2565 		0,  /* non standard timer */
2566 		0, /* open_requests have no inode */
2567 		0,
2568 		req);
2569 }
2570 
2571 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2572 {
2573 	int timer_active;
2574 	unsigned long timer_expires;
2575 	const struct tcp_sock *tp = tcp_sk(sk);
2576 	const struct inet_connection_sock *icsk = inet_csk(sk);
2577 	const struct inet_sock *inet = inet_sk(sk);
2578 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2579 	__be32 dest = inet->inet_daddr;
2580 	__be32 src = inet->inet_rcv_saddr;
2581 	__u16 destp = ntohs(inet->inet_dport);
2582 	__u16 srcp = ntohs(inet->inet_sport);
2583 	int rx_queue;
2584 	int state;
2585 
2586 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2587 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2588 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2589 		timer_active	= 1;
2590 		timer_expires	= icsk->icsk_timeout;
2591 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2592 		timer_active	= 4;
2593 		timer_expires	= icsk->icsk_timeout;
2594 	} else if (timer_pending(&sk->sk_timer)) {
2595 		timer_active	= 2;
2596 		timer_expires	= sk->sk_timer.expires;
2597 	} else {
2598 		timer_active	= 0;
2599 		timer_expires = jiffies;
2600 	}
2601 
2602 	state = inet_sk_state_load(sk);
2603 	if (state == TCP_LISTEN)
2604 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2605 	else
2606 		/* Because we don't lock the socket,
2607 		 * we might find a transient negative value.
2608 		 */
2609 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2610 				      READ_ONCE(tp->copied_seq), 0);
2611 
2612 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2613 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2614 		i, src, srcp, dest, destp, state,
2615 		READ_ONCE(tp->write_seq) - tp->snd_una,
2616 		rx_queue,
2617 		timer_active,
2618 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2619 		icsk->icsk_retransmits,
2620 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2621 		icsk->icsk_probes_out,
2622 		sock_i_ino(sk),
2623 		refcount_read(&sk->sk_refcnt), sk,
2624 		jiffies_to_clock_t(icsk->icsk_rto),
2625 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2626 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2627 		tcp_snd_cwnd(tp),
2628 		state == TCP_LISTEN ?
2629 		    fastopenq->max_qlen :
2630 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2631 }
2632 
2633 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2634 			       struct seq_file *f, int i)
2635 {
2636 	long delta = tw->tw_timer.expires - jiffies;
2637 	__be32 dest, src;
2638 	__u16 destp, srcp;
2639 
2640 	dest  = tw->tw_daddr;
2641 	src   = tw->tw_rcv_saddr;
2642 	destp = ntohs(tw->tw_dport);
2643 	srcp  = ntohs(tw->tw_sport);
2644 
2645 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2646 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2647 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2648 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2649 		refcount_read(&tw->tw_refcnt), tw);
2650 }
2651 
2652 #define TMPSZ 150
2653 
2654 static int tcp4_seq_show(struct seq_file *seq, void *v)
2655 {
2656 	struct tcp_iter_state *st;
2657 	struct sock *sk = v;
2658 
2659 	seq_setwidth(seq, TMPSZ - 1);
2660 	if (v == SEQ_START_TOKEN) {
2661 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2662 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2663 			   "inode");
2664 		goto out;
2665 	}
2666 	st = seq->private;
2667 
2668 	if (sk->sk_state == TCP_TIME_WAIT)
2669 		get_timewait4_sock(v, seq, st->num);
2670 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2671 		get_openreq4(v, seq, st->num);
2672 	else
2673 		get_tcp4_sock(v, seq, st->num);
2674 out:
2675 	seq_pad(seq, '\n');
2676 	return 0;
2677 }
2678 
2679 #ifdef CONFIG_BPF_SYSCALL
/* Private seq_file state of the bpf TCP iterator.
 *
 * Sockets of one hash bucket are collected into @batch with a reference
 * held on each, and are then handed out one by one.
 */
2680 struct bpf_tcp_iter_state {
2681 	struct tcp_iter_state state;	/* state shared with the proc iterator helpers */
2682 	unsigned int cur_sk;		/* index of the batch entry currently handed out */
2683 	unsigned int end_sk;		/* number of entries stored in batch */
2684 	unsigned int max_sk;		/* number of slots allocated for batch */
2685 	struct sock **batch;		/* referenced sockets of the current bucket */
2686 	bool st_bucket_done;		/* set when the whole bucket fit into batch */
2687 };
2688 
/* Context filled in by tcp_prog_seq_show() and passed to the bpf program. */
2689 struct bpf_iter__tcp {
2690 	__bpf_md_ptr(struct bpf_iter_meta *, meta);		/* iterator meta data */
2691 	__bpf_md_ptr(struct sock_common *, sk_common);	/* socket being shown; NULL for the call made from ->stop */
2692 	uid_t uid __aligned(8);				/* uid computed by the caller for this socket */
2693 };
2694 
2695 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2696 			     struct sock_common *sk_common, uid_t uid)
2697 {
2698 	struct bpf_iter__tcp ctx;
2699 
2700 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2701 	ctx.meta = meta;
2702 	ctx.sk_common = sk_common;
2703 	ctx.uid = uid;
2704 	return bpf_iter_run_prog(prog, &ctx);
2705 }
2706 
2707 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2708 {
2709 	while (iter->cur_sk < iter->end_sk)
2710 		sock_put(iter->batch[iter->cur_sk++]);
2711 }
2712 
2713 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2714 				      unsigned int new_batch_sz)
2715 {
2716 	struct sock **new_batch;
2717 
2718 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2719 			     GFP_USER | __GFP_NOWARN);
2720 	if (!new_batch)
2721 		return -ENOMEM;
2722 
2723 	bpf_iter_tcp_put_batch(iter);
2724 	kvfree(iter->batch);
2725 	iter->batch = new_batch;
2726 	iter->max_sk = new_batch_sz;
2727 
2728 	return 0;
2729 }
2730 
2731 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2732 						 struct sock *start_sk)
2733 {
2734 	struct bpf_tcp_iter_state *iter = seq->private;
2735 	struct tcp_iter_state *st = &iter->state;
2736 	struct hlist_nulls_node *node;
2737 	unsigned int expected = 1;
2738 	struct sock *sk;
2739 
2740 	sock_hold(start_sk);
2741 	iter->batch[iter->end_sk++] = start_sk;
2742 
2743 	sk = sk_nulls_next(start_sk);
2744 	sk_nulls_for_each_from(sk, node) {
2745 		if (seq_sk_match(seq, sk)) {
2746 			if (iter->end_sk < iter->max_sk) {
2747 				sock_hold(sk);
2748 				iter->batch[iter->end_sk++] = sk;
2749 			}
2750 			expected++;
2751 		}
2752 	}
2753 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2754 
2755 	return expected;
2756 }
2757 
2758 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2759 						   struct sock *start_sk)
2760 {
2761 	struct bpf_tcp_iter_state *iter = seq->private;
2762 	struct tcp_iter_state *st = &iter->state;
2763 	struct hlist_nulls_node *node;
2764 	unsigned int expected = 1;
2765 	struct sock *sk;
2766 
2767 	sock_hold(start_sk);
2768 	iter->batch[iter->end_sk++] = start_sk;
2769 
2770 	sk = sk_nulls_next(start_sk);
2771 	sk_nulls_for_each_from(sk, node) {
2772 		if (seq_sk_match(seq, sk)) {
2773 			if (iter->end_sk < iter->max_sk) {
2774 				sock_hold(sk);
2775 				iter->batch[iter->end_sk++] = sk;
2776 			}
2777 			expected++;
2778 		}
2779 	}
2780 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2781 
2782 	return expected;
2783 }
2784 
2785 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2786 {
2787 	struct bpf_tcp_iter_state *iter = seq->private;
2788 	struct tcp_iter_state *st = &iter->state;
2789 	unsigned int expected;
2790 	bool resized = false;
2791 	struct sock *sk;
2792 
2793 	/* The st->bucket is done.  Directly advance to the next
2794 	 * bucket instead of having the tcp_seek_last_pos() to skip
2795 	 * one by one in the current bucket and eventually find out
2796 	 * it has to advance to the next bucket.
2797 	 */
2798 	if (iter->st_bucket_done) {
2799 		st->offset = 0;
2800 		st->bucket++;
2801 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2802 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2803 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2804 			st->bucket = 0;
2805 		}
2806 	}
2807 
2808 again:
2809 	/* Get a new batch */
2810 	iter->cur_sk = 0;
2811 	iter->end_sk = 0;
2812 	iter->st_bucket_done = false;
2813 
2814 	sk = tcp_seek_last_pos(seq);
2815 	if (!sk)
2816 		return NULL; /* Done */
2817 
2818 	if (st->state == TCP_SEQ_STATE_LISTENING)
2819 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2820 	else
2821 		expected = bpf_iter_tcp_established_batch(seq, sk);
2822 
2823 	if (iter->end_sk == expected) {
2824 		iter->st_bucket_done = true;
2825 		return sk;
2826 	}
2827 
2828 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2829 		resized = true;
2830 		goto again;
2831 	}
2832 
2833 	return sk;
2834 }
2835 
2836 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2837 {
2838 	/* bpf iter does not support lseek, so it always
2839 	 * continue from where it was stop()-ped.
2840 	 */
2841 	if (*pos)
2842 		return bpf_iter_tcp_batch(seq);
2843 
2844 	return SEQ_START_TOKEN;
2845 }
2846 
2847 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2848 {
2849 	struct bpf_tcp_iter_state *iter = seq->private;
2850 	struct tcp_iter_state *st = &iter->state;
2851 	struct sock *sk;
2852 
2853 	/* Whenever seq_next() is called, the iter->cur_sk is
2854 	 * done with seq_show(), so advance to the next sk in
2855 	 * the batch.
2856 	 */
2857 	if (iter->cur_sk < iter->end_sk) {
2858 		/* Keeping st->num consistent in tcp_iter_state.
2859 		 * bpf_iter_tcp does not use st->num.
2860 		 * meta.seq_num is used instead.
2861 		 */
2862 		st->num++;
2863 		/* Move st->offset to the next sk in the bucket such that
2864 		 * the future start() will resume at st->offset in
2865 		 * st->bucket.  See tcp_seek_last_pos().
2866 		 */
2867 		st->offset++;
2868 		sock_put(iter->batch[iter->cur_sk++]);
2869 	}
2870 
2871 	if (iter->cur_sk < iter->end_sk)
2872 		sk = iter->batch[iter->cur_sk];
2873 	else
2874 		sk = bpf_iter_tcp_batch(seq);
2875 
2876 	++*pos;
2877 	/* Keeping st->last_pos consistent in tcp_iter_state.
2878 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2879 	 */
2880 	st->last_pos = *pos;
2881 	return sk;
2882 }
2883 
2884 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2885 {
2886 	struct bpf_iter_meta meta;
2887 	struct bpf_prog *prog;
2888 	struct sock *sk = v;
2889 	bool slow;
2890 	uid_t uid;
2891 	int ret;
2892 
2893 	if (v == SEQ_START_TOKEN)
2894 		return 0;
2895 
2896 	if (sk_fullsock(sk))
2897 		slow = lock_sock_fast(sk);
2898 
2899 	if (unlikely(sk_unhashed(sk))) {
2900 		ret = SEQ_SKIP;
2901 		goto unlock;
2902 	}
2903 
2904 	if (sk->sk_state == TCP_TIME_WAIT) {
2905 		uid = 0;
2906 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2907 		const struct request_sock *req = v;
2908 
2909 		uid = from_kuid_munged(seq_user_ns(seq),
2910 				       sock_i_uid(req->rsk_listener));
2911 	} else {
2912 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2913 	}
2914 
2915 	meta.seq = seq;
2916 	prog = bpf_iter_get_info(&meta, false);
2917 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918 
2919 unlock:
2920 	if (sk_fullsock(sk))
2921 		unlock_sock_fast(sk, slow);
2922 	return ret;
2923 
2924 }
2925 
2926 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2927 {
2928 	struct bpf_tcp_iter_state *iter = seq->private;
2929 	struct bpf_iter_meta meta;
2930 	struct bpf_prog *prog;
2931 
2932 	if (!v) {
2933 		meta.seq = seq;
2934 		prog = bpf_iter_get_info(&meta, true);
2935 		if (prog)
2936 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2937 	}
2938 
2939 	if (iter->cur_sk < iter->end_sk) {
2940 		bpf_iter_tcp_put_batch(iter);
2941 		iter->st_bucket_done = false;
2942 	}
2943 }
2944 
2945 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2946 	.show		= bpf_iter_tcp_seq_show,
2947 	.start		= bpf_iter_tcp_seq_start,
2948 	.next		= bpf_iter_tcp_seq_next,
2949 	.stop		= bpf_iter_tcp_seq_stop,
2950 };
2951 #endif
2952 static unsigned short seq_file_family(const struct seq_file *seq)
2953 {
2954 	const struct tcp_seq_afinfo *afinfo;
2955 
2956 #ifdef CONFIG_BPF_SYSCALL
2957 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
2958 	if (seq->op == &bpf_iter_tcp_seq_ops)
2959 		return AF_UNSPEC;
2960 #endif
2961 
2962 	/* Iterated from proc fs */
2963 	afinfo = pde_data(file_inode(seq->file));
2964 	return afinfo->family;
2965 }
2966 
2967 static const struct seq_operations tcp4_seq_ops = {
2968 	.show		= tcp4_seq_show,
2969 	.start		= tcp_seq_start,
2970 	.next		= tcp_seq_next,
2971 	.stop		= tcp_seq_stop,
2972 };
2973 
/* Data attached to the "tcp" proc entry by tcp4_proc_init_net();
 * seq_file_family() reads the family back to limit the dump to AF_INET.
 */
2974 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2975 	.family		= AF_INET,
2976 };
2977 
2978 static int __net_init tcp4_proc_init_net(struct net *net)
2979 {
2980 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2981 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2982 		return -ENOMEM;
2983 	return 0;
2984 }
2985 
2986 static void __net_exit tcp4_proc_exit_net(struct net *net)
2987 {
2988 	remove_proc_entry("tcp", net->proc_net);
2989 }
2990 
/* Per network namespace hooks that create and remove the "tcp" proc entry. */
2991 static struct pernet_operations tcp4_net_ops = {
2992 	.init = tcp4_proc_init_net,
2993 	.exit = tcp4_proc_exit_net,
2994 };
2995 
2996 int __init tcp4_proc_init(void)
2997 {
2998 	return register_pernet_subsys(&tcp4_net_ops);
2999 }
3000 
3001 void tcp4_proc_exit(void)
3002 {
3003 	unregister_pernet_subsys(&tcp4_net_ops);
3004 }
3005 #endif /* CONFIG_PROC_FS */
3006 
3007 /* @wake is one when sk_stream_write_space() calls us.
3008  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3009  * This mimics the strategy used in sock_def_write_space().
3010  */
3011 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3012 {
3013 	const struct tcp_sock *tp = tcp_sk(sk);
3014 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3015 			    READ_ONCE(tp->snd_nxt);
3016 
3017 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3018 }
3019 EXPORT_SYMBOL(tcp_stream_memory_free);
3020 
3021 struct proto tcp_prot = {
3022 	.name			= "TCP",
3023 	.owner			= THIS_MODULE,
3024 	.close			= tcp_close,
3025 	.pre_connect		= tcp_v4_pre_connect,
3026 	.connect		= tcp_v4_connect,
3027 	.disconnect		= tcp_disconnect,
3028 	.accept			= inet_csk_accept,
3029 	.ioctl			= tcp_ioctl,
3030 	.init			= tcp_v4_init_sock,
3031 	.destroy		= tcp_v4_destroy_sock,
3032 	.shutdown		= tcp_shutdown,
3033 	.setsockopt		= tcp_setsockopt,
3034 	.getsockopt		= tcp_getsockopt,
3035 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3036 	.keepalive		= tcp_set_keepalive,
3037 	.recvmsg		= tcp_recvmsg,
3038 	.sendmsg		= tcp_sendmsg,
3039 	.sendpage		= tcp_sendpage,
3040 	.backlog_rcv		= tcp_v4_do_rcv,
3041 	.release_cb		= tcp_release_cb,
3042 	.hash			= inet_hash,
3043 	.unhash			= inet_unhash,
3044 	.get_port		= inet_csk_get_port,
3045 	.put_port		= inet_put_port,
3046 #ifdef CONFIG_BPF_SYSCALL
3047 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3048 #endif
3049 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3050 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3051 	.stream_memory_free	= tcp_stream_memory_free,
3052 	.sockets_allocated	= &tcp_sockets_allocated,
3053 	.orphan_count		= &tcp_orphan_count,
3054 
3055 	.memory_allocated	= &tcp_memory_allocated,
3056 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3057 
3058 	.memory_pressure	= &tcp_memory_pressure,
3059 	.sysctl_mem		= sysctl_tcp_mem,
3060 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3061 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3062 	.max_header		= MAX_TCP_HEADER,
3063 	.obj_size		= sizeof(struct tcp_sock),
3064 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3065 	.twsk_prot		= &tcp_timewait_sock_ops,
3066 	.rsk_prot		= &tcp_request_sock_ops,
3067 	.h.hashinfo		= &tcp_hashinfo,
3068 	.no_autobind		= true,
3069 	.diag_destroy		= tcp_abort,
3070 };
3071 EXPORT_SYMBOL(tcp_prot);
3072 
3073 static void __net_exit tcp_sk_exit(struct net *net)
3074 {
3075 	struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3076 
3077 	if (net->ipv4.tcp_congestion_control)
3078 		bpf_module_put(net->ipv4.tcp_congestion_control,
3079 			       net->ipv4.tcp_congestion_control->owner);
3080 	if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3081 		kfree(tcp_death_row);
3082 }
3083 
3084 static int __net_init tcp_sk_init(struct net *net)
3085 {
3086 	int cnt;
3087 
3088 	net->ipv4.sysctl_tcp_ecn = 2;
3089 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3090 
3091 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3092 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3093 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3094 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3095 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3096 
3097 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3098 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3099 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3100 
3101 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3102 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3103 	net->ipv4.sysctl_tcp_syncookies = 1;
3104 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3105 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3106 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3107 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3108 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3109 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3110 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3111 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3112 
3113 	net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3114 	if (!net->ipv4.tcp_death_row)
3115 		return -ENOMEM;
3116 	refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3117 	cnt = tcp_hashinfo.ehash_mask + 1;
3118 	net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3119 	net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3120 
3121 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3122 	net->ipv4.sysctl_tcp_sack = 1;
3123 	net->ipv4.sysctl_tcp_window_scaling = 1;
3124 	net->ipv4.sysctl_tcp_timestamps = 1;
3125 	net->ipv4.sysctl_tcp_early_retrans = 3;
3126 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3127 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3128 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3129 	net->ipv4.sysctl_tcp_max_reordering = 300;
3130 	net->ipv4.sysctl_tcp_dsack = 1;
3131 	net->ipv4.sysctl_tcp_app_win = 31;
3132 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3133 	net->ipv4.sysctl_tcp_frto = 2;
3134 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3135 	/* This limits the percentage of the congestion window which we
3136 	 * will allow a single TSO frame to consume.  Building TSO frames
3137 	 * which are too large can cause TCP streams to be bursty.
3138 	 */
3139 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3140 	/* Default TSQ limit of 16 TSO segments */
3141 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3142 
3143 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3144 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3145 
3146 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3147 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3148 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3149 	net->ipv4.sysctl_tcp_autocorking = 1;
3150 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3151 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3152 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3153 	if (net != &init_net) {
3154 		memcpy(net->ipv4.sysctl_tcp_rmem,
3155 		       init_net.ipv4.sysctl_tcp_rmem,
3156 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3157 		memcpy(net->ipv4.sysctl_tcp_wmem,
3158 		       init_net.ipv4.sysctl_tcp_wmem,
3159 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3160 	}
3161 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3162 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3163 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3164 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3165 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3166 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3167 
3168 	/* Reno is always built in */
3169 	if (!net_eq(net, &init_net) &&
3170 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3171 			       init_net.ipv4.tcp_congestion_control->owner))
3172 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3173 	else
3174 		net->ipv4.tcp_congestion_control = &tcp_reno;
3175 
3176 	return 0;
3177 }
3178 
3179 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3180 {
3181 	struct net *net;
3182 
3183 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3184 
3185 	list_for_each_entry(net, net_exit_list, exit_list)
3186 		tcp_fastopen_ctx_destroy(net);
3187 }
3188 
3189 static struct pernet_operations __net_initdata tcp_sk_ops = {
3190        .init	   = tcp_sk_init,
3191        .exit	   = tcp_sk_exit,
3192        .exit_batch = tcp_sk_exit_batch,
3193 };
3194 
3195 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3196 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3197 		     struct sock_common *sk_common, uid_t uid)
3198 
/* Batch size requested when a "tcp" iterator is first set up. */
#define INIT_BATCH_SZ 16

/* Set up the private state of a "tcp" BPF iterator.
 *
 * Initialises the seq_net part of @priv_data and then sizes the batch
 * for INIT_BATCH_SZ entries.  If the batch step fails, the seq_net
 * state is torn down again.
 *
 * Returns 0 on success or the error reported by the failing step.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_tcp_iter_state *state = priv_data;
	int ret;

	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		return ret;

	ret = bpf_iter_tcp_realloc_batch(state, INIT_BATCH_SZ);
	if (ret)
		bpf_iter_fini_seq_net(priv_data);

	return ret;
}
3218 
3219 static void bpf_iter_fini_tcp(void *priv_data)
3220 {
3221 	struct bpf_tcp_iter_state *iter = priv_data;
3222 
3223 	bpf_iter_fini_seq_net(priv_data);
3224 	kvfree(iter->batch);
3225 }
3226 
3227 static const struct bpf_iter_seq_info tcp_seq_info = {
3228 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3229 	.init_seq_private	= bpf_iter_init_tcp,
3230 	.fini_seq_private	= bpf_iter_fini_tcp,
3231 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3232 };
3233 
3234 static const struct bpf_func_proto *
3235 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3236 			    const struct bpf_prog *prog)
3237 {
3238 	switch (func_id) {
3239 	case BPF_FUNC_setsockopt:
3240 		return &bpf_sk_setsockopt_proto;
3241 	case BPF_FUNC_getsockopt:
3242 		return &bpf_sk_getsockopt_proto;
3243 	default:
3244 		return NULL;
3245 	}
3246 }
3247 
3248 static struct bpf_iter_reg tcp_reg_info = {
3249 	.target			= "tcp",
3250 	.ctx_arg_info_size	= 1,
3251 	.ctx_arg_info		= {
3252 		{ offsetof(struct bpf_iter__tcp, sk_common),
3253 		  PTR_TO_BTF_ID_OR_NULL },
3254 	},
3255 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3256 	.seq_info		= &tcp_seq_info,
3257 };
3258 
3259 static void __init bpf_iter_register(void)
3260 {
3261 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3262 	if (bpf_iter_reg_target(&tcp_reg_info))
3263 		pr_warn("Warning: could not register bpf iterator tcp\n");
3264 }
3265 
3266 #endif
3267 
3268 void __init tcp_v4_init(void)
3269 {
3270 	int cpu, res;
3271 
3272 	for_each_possible_cpu(cpu) {
3273 		struct sock *sk;
3274 
3275 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3276 					   IPPROTO_TCP, &init_net);
3277 		if (res)
3278 			panic("Failed to create the TCP control socket.\n");
3279 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3280 
3281 		/* Please enforce IP_DF and IPID==0 for RST and
3282 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3283 		 */
3284 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3285 
3286 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3287 	}
3288 	if (register_pernet_subsys(&tcp_sk_ops))
3289 		panic("Failed to create the TCP control socket.\n");
3290 
3291 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3292 	bpf_iter_register();
3293 #endif
3294 }
3295