xref: /linux/net/ipv4/tcp_ipv4.c (revision 2aceb896ee18ae35b21b14c978d8c2ef8c7b439d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81 
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84 
85 #include <trace/events/tcp.h>
86 
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91 
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94 
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96 
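/* Derive the initial sequence number for this connection from the
 * addresses and ports of the incoming segment.  secure_tcp_seq()
 * mixes the 4-tuple with a per-boot secret (RFC 6528 style) so that
 * off-path attackers cannot predict the ISN.
 */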
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99 	return secure_tcp_seq(ip_hdr(skb)->daddr,
100 			      ip_hdr(skb)->saddr,
101 			      tcp_hdr(skb)->dest,
102 			      tcp_hdr(skb)->source);
103 }
104 
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109 
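/* Decide whether a new outgoing connection may reuse a 4-tuple that is
 * still held by a TIME-WAIT socket.  Reuse relies on the remembered
 * timestamps (PAWS) being present and, when governed by the
 * tcp_tw_reuse sysctl (a value of 2 restricts it to loopback traffic),
 * on enough time having passed; the new socket then inherits
 * ts_recent and starts its write_seq above the old snd_nxt.
 */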
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
114 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115 	struct tcp_sock *tp = tcp_sk(sk);
116 
117 	if (reuse == 2) {
118 		/* Still does not detect *everything* that goes through
119 		 * lo, since we require a loopback src or dst address
120 		 * or direct binding to the 'lo' interface.
121 		 */
122 		bool loopback = false;
123 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124 			loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126 		if (tw->tw_family == AF_INET6) {
127 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131 				loopback = true;
132 		} else
133 #endif
134 		{
135 			if (ipv4_is_loopback(tw->tw_daddr) ||
136 			    ipv4_is_loopback(tw->tw_rcv_saddr))
137 				loopback = true;
138 		}
139 		if (!loopback)
140 			reuse = 0;
141 	}
142 
143 	/* With PAWS, it is safe from the viewpoint
144 	   of data integrity. Even without PAWS it is safe provided sequence
145 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
146 
147 	   Actually, the idea is close to VJ's one, only the timestamp cache is
148 	   held not per host but per port pair, and the TW bucket is used as the
149 	   state holder.
150 
151 	   If the TW bucket has already been destroyed we fall back to VJ's
152 	   scheme and use the initial timestamp retrieved from the peer table.
153 	 */
154 	if (tcptw->tw_ts_recent_stamp &&
155 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
156 					    tcptw->tw_ts_recent_stamp)))) {
157 		/* In case of repair and re-using TIME-WAIT sockets we still
158 		 * want to be sure that it is safe as above but honor the
159 		 * sequence numbers and time stamps set as part of the repair
160 		 * process.
161 		 *
162 		 * Without this check re-using a TIME-WAIT socket with TCP
163 		 * repair would accumulate a -1 on the repair assigned
164 		 * sequence number. The first time it is reused the sequence
165 		 * is -1, the second time -2, etc. This fixes that issue
166 		 * without appearing to create any others.
167 		 */
168 		if (likely(!tp->repair)) {
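			/* Start 64K + 2 bytes past the old connection's
			 * snd_nxt so the new sequence space cannot overlap
			 * with segments still in flight from the previous
			 * incarnation; 0 is skipped because a zero write_seq
			 * is treated as "not yet chosen".
			 */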
169 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170 
171 			if (!seq)
172 				seq = 1;
173 			WRITE_ONCE(tp->write_seq, seq);
174 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
175 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176 		}
177 		sock_hold(sktw);
178 		return 1;
179 	}
180 
181 	return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184 
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186 			      int addr_len)
187 {
188 	/* This check is replicated from tcp_v4_connect() and is intended to
189 	 * prevent the BPF program called below from accessing bytes that are
190 	 * outside the bound specified by the user in addr_len.
191 	 */
192 	if (addr_len < sizeof(struct sockaddr_in))
193 		return -EINVAL;
194 
195 	sock_owned_by_me(sk);
196 
197 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
198 }
199 
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204 	struct inet_timewait_death_row *tcp_death_row;
205 	struct inet_sock *inet = inet_sk(sk);
206 	struct tcp_sock *tp = tcp_sk(sk);
207 	struct ip_options_rcu *inet_opt;
208 	struct net *net = sock_net(sk);
209 	__be16 orig_sport, orig_dport;
210 	__be32 daddr, nexthop;
211 	struct flowi4 *fl4;
212 	struct rtable *rt;
213 	int err;
214 
215 	if (addr_len < sizeof(struct sockaddr_in))
216 		return -EINVAL;
217 
218 	if (usin->sin_family != AF_INET)
219 		return -EAFNOSUPPORT;
220 
221 	nexthop = daddr = usin->sin_addr.s_addr;
222 	inet_opt = rcu_dereference_protected(inet->inet_opt,
223 					     lockdep_sock_is_held(sk));
224 	if (inet_opt && inet_opt->opt.srr) {
225 		if (!daddr)
226 			return -EINVAL;
227 		nexthop = inet_opt->opt.faddr;
228 	}
229 
230 	orig_sport = inet->inet_sport;
231 	orig_dport = usin->sin_port;
232 	fl4 = &inet->cork.fl.u.ip4;
233 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235 			      orig_dport, sk);
236 	if (IS_ERR(rt)) {
237 		err = PTR_ERR(rt);
238 		if (err == -ENETUNREACH)
239 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240 		return err;
241 	}
242 
243 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244 		ip_rt_put(rt);
245 		return -ENETUNREACH;
246 	}
247 
248 	if (!inet_opt || !inet_opt->opt.srr)
249 		daddr = fl4->daddr;
250 
251 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252 
253 	if (!inet->inet_saddr) {
254 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255 		if (err) {
256 			ip_rt_put(rt);
257 			return err;
258 		}
259 	} else {
260 		sk_rcv_saddr_set(sk, inet->inet_saddr);
261 	}
262 
263 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264 		/* Reset inherited state */
265 		tp->rx_opt.ts_recent	   = 0;
266 		tp->rx_opt.ts_recent_stamp = 0;
267 		if (likely(!tp->repair))
268 			WRITE_ONCE(tp->write_seq, 0);
269 	}
270 
271 	inet->inet_dport = usin->sin_port;
272 	sk_daddr_set(sk, daddr);
273 
274 	inet_csk(sk)->icsk_ext_hdr_len = 0;
275 	if (inet_opt)
276 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277 
278 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279 
280 	/* Socket identity is still unknown (sport may be zero).
281 	 * However we set state to SYN-SENT and, without releasing the socket
282 	 * lock, select a source port, enter ourselves into the hash tables
283 	 * and complete initialization after this.
284 	 */
285 	tcp_set_state(sk, TCP_SYN_SENT);
286 	err = inet_hash_connect(tcp_death_row, sk);
287 	if (err)
288 		goto failure;
289 
290 	sk_set_txhash(sk);
291 
292 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293 			       inet->inet_sport, inet->inet_dport, sk);
294 	if (IS_ERR(rt)) {
295 		err = PTR_ERR(rt);
296 		rt = NULL;
297 		goto failure;
298 	}
299 	/* OK, now commit destination to socket.  */
300 	sk->sk_gso_type = SKB_GSO_TCPV4;
301 	sk_setup_caps(sk, &rt->dst);
302 	rt = NULL;
303 
304 	if (likely(!tp->repair)) {
305 		if (!tp->write_seq)
306 			WRITE_ONCE(tp->write_seq,
307 				   secure_tcp_seq(inet->inet_saddr,
308 						  inet->inet_daddr,
309 						  inet->inet_sport,
310 						  usin->sin_port));
311 		WRITE_ONCE(tp->tsoffset,
312 			   secure_tcp_ts_off(net, inet->inet_saddr,
313 					     inet->inet_daddr));
314 	}
315 
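	/* Seed the per-socket IP ID generator with a random value so that
	 * the IDs used on this connection do not start from a predictable
	 * point.
	 */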
316 	atomic_set(&inet->inet_id, get_random_u16());
317 
318 	if (tcp_fastopen_defer_connect(sk, &err))
319 		return err;
320 	if (err)
321 		goto failure;
322 
323 	err = tcp_connect(sk);
324 
325 	if (err)
326 		goto failure;
327 
328 	return 0;
329 
330 failure:
331 	/*
332 	 * This unhashes the socket and releases the local port,
333 	 * if necessary.
334 	 */
335 	tcp_set_state(sk, TCP_CLOSE);
336 	inet_bhash2_reset_saddr(sk);
337 	ip_rt_put(rt);
338 	sk->sk_route_caps = 0;
339 	inet->inet_dport = 0;
340 	return err;
341 }
342 EXPORT_SYMBOL(tcp_v4_connect);
343 
344 /*
345  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
346  * It can be called through tcp_release_cb() if socket was owned by user
347  * at the time tcp_v4_err() was called to handle ICMP message.
348  */
349 void tcp_v4_mtu_reduced(struct sock *sk)
350 {
351 	struct inet_sock *inet = inet_sk(sk);
352 	struct dst_entry *dst;
353 	u32 mtu;
354 
355 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356 		return;
357 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358 	dst = inet_csk_update_pmtu(sk, mtu);
359 	if (!dst)
360 		return;
361 
362 	/* Something is about to go wrong... Remember the soft error
363 	 * in case this connection is not able to recover.
364 	 */
365 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
367 
368 	mtu = dst_mtu(dst);
369 
370 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371 	    ip_sk_accept_pmtu(sk) &&
372 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373 		tcp_sync_mss(sk, mtu);
374 
375 		/* Resend the TCP packet because it's
376 		 * clear that the old packet has been
377 		 * dropped. This is the new "fast" path mtu
378 		 * discovery.
379 		 */
380 		tcp_simple_retransmit(sk);
381 	} /* else let the usual retransmit timer handle it */
382 }
383 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384 
385 static void do_redirect(struct sk_buff *skb, struct sock *sk)
386 {
387 	struct dst_entry *dst = __sk_dst_check(sk, 0);
388 
389 	if (dst)
390 		dst->ops->redirect(dst, sk, skb);
391 }
392 
393 
394 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396 {
397 	struct request_sock *req = inet_reqsk(sk);
398 	struct net *net = sock_net(sk);
399 
400 	/* ICMPs are not backlogged, hence we cannot get
401 	 * an established socket here.
402 	 */
403 	if (seq != tcp_rsk(req)->snt_isn) {
404 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405 	} else if (abort) {
406 		/*
407 		 * Still in SYN_RECV, just remove it silently.
408 		 * There is no good way to pass the error to the newly
409 		 * created socket, and POSIX does not want network
410 		 * errors returned from accept().
411 		 */
412 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413 		tcp_listendrop(req->rsk_listener);
414 	}
415 	reqsk_put(req);
416 }
417 EXPORT_SYMBOL(tcp_req_err);
418 
419 /* TCP-LD (RFC 6069) logic */
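/* If an ICMP unreachable refers to the oldest unacknowledged segment
 * while we are in exponential backoff, the earlier RTO was likely due
 * to a transient routing failure rather than congestion.  Undo one
 * backoff step, recompute the RTO and either re-arm the retransmit
 * timer for the time remaining or retransmit immediately if it has
 * already expired.
 */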
420 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
421 {
422 	struct inet_connection_sock *icsk = inet_csk(sk);
423 	struct tcp_sock *tp = tcp_sk(sk);
424 	struct sk_buff *skb;
425 	s32 remaining;
426 	u32 delta_us;
427 
428 	if (sock_owned_by_user(sk))
429 		return;
430 
431 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
432 	    !icsk->icsk_backoff)
433 		return;
434 
435 	skb = tcp_rtx_queue_head(sk);
436 	if (WARN_ON_ONCE(!skb))
437 		return;
438 
439 	icsk->icsk_backoff--;
440 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
441 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
442 
443 	tcp_mstamp_refresh(tp);
444 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
445 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
446 
447 	if (remaining > 0) {
448 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449 					  remaining, TCP_RTO_MAX);
450 	} else {
451 		/* RTO revert clocked out retransmission.
452 		 * Will retransmit now.
453 		 */
454 		tcp_retransmit_timer(sk);
455 	}
456 }
457 EXPORT_SYMBOL(tcp_ld_RTO_revert);
458 
459 /*
460  * This routine is called by the ICMP module when it gets some
461  * sort of error condition.  If err < 0 then the socket should
462  * be closed and the error returned to the user.  If err > 0
463  * it's just the icmp type << 8 | icmp code.  After adjustment
464  * header points to the first 8 bytes of the tcp header.  We need
465  * to find the appropriate port.
466  *
467  * The locking strategy used here is very "optimistic". When
468  * someone else accesses the socket the ICMP is just dropped
469  * and for some paths there is no check at all.
470  * A more general error queue to queue errors for later handling
471  * is probably better.
472  *
473  */
474 
475 int tcp_v4_err(struct sk_buff *skb, u32 info)
476 {
477 	const struct iphdr *iph = (const struct iphdr *)skb->data;
478 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
479 	struct tcp_sock *tp;
480 	const int type = icmp_hdr(skb)->type;
481 	const int code = icmp_hdr(skb)->code;
482 	struct sock *sk;
483 	struct request_sock *fastopen;
484 	u32 seq, snd_una;
485 	int err;
486 	struct net *net = dev_net(skb->dev);
487 
488 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489 				       iph->daddr, th->dest, iph->saddr,
490 				       ntohs(th->source), inet_iif(skb), 0);
491 	if (!sk) {
492 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493 		return -ENOENT;
494 	}
495 	if (sk->sk_state == TCP_TIME_WAIT) {
496 		inet_twsk_put(inet_twsk(sk));
497 		return 0;
498 	}
499 	seq = ntohl(th->seq);
500 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
501 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502 				     type == ICMP_TIME_EXCEEDED ||
503 				     (type == ICMP_DEST_UNREACH &&
504 				      (code == ICMP_NET_UNREACH ||
505 				       code == ICMP_HOST_UNREACH)));
506 		return 0;
507 	}
508 
509 	bh_lock_sock(sk);
510 	/* If too many ICMPs get dropped on busy
511 	 * servers this needs to be solved differently.
512 	 * We do take care of the PMTU discovery (RFC1191) special case:
513 	 * we can receive locally generated ICMP messages while the socket is held.
514 	 */
515 	if (sock_owned_by_user(sk)) {
516 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518 	}
519 	if (sk->sk_state == TCP_CLOSE)
520 		goto out;
521 
522 	if (static_branch_unlikely(&ip4_min_ttl)) {
523 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
524 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526 			goto out;
527 		}
528 	}
529 
530 	tp = tcp_sk(sk);
531 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
532 	fastopen = rcu_dereference(tp->fastopen_rsk);
533 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534 	if (sk->sk_state != TCP_LISTEN &&
535 	    !between(seq, snd_una, tp->snd_nxt)) {
536 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537 		goto out;
538 	}
539 
540 	switch (type) {
541 	case ICMP_REDIRECT:
542 		if (!sock_owned_by_user(sk))
543 			do_redirect(skb, sk);
544 		goto out;
545 	case ICMP_SOURCE_QUENCH:
546 		/* Just silently ignore these. */
547 		goto out;
548 	case ICMP_PARAMETERPROB:
549 		err = EPROTO;
550 		break;
551 	case ICMP_DEST_UNREACH:
552 		if (code > NR_ICMP_UNREACH)
553 			goto out;
554 
555 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556 			/* We are not interested in TCP_LISTEN and open_requests
557 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
558 			 * they should go through unfragmented).
559 			 */
560 			if (sk->sk_state == TCP_LISTEN)
561 				goto out;
562 
563 			WRITE_ONCE(tp->mtu_info, info);
564 			if (!sock_owned_by_user(sk)) {
565 				tcp_v4_mtu_reduced(sk);
566 			} else {
567 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568 					sock_hold(sk);
569 			}
570 			goto out;
571 		}
572 
573 		err = icmp_err_convert[code].errno;
574 		/* check if this ICMP message allows revert of backoff.
575 		 * (see RFC 6069)
576 		 */
577 		if (!fastopen &&
578 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579 			tcp_ld_RTO_revert(sk, seq);
580 		break;
581 	case ICMP_TIME_EXCEEDED:
582 		err = EHOSTUNREACH;
583 		break;
584 	default:
585 		goto out;
586 	}
587 
588 	switch (sk->sk_state) {
589 	case TCP_SYN_SENT:
590 	case TCP_SYN_RECV:
591 		/* Only in fast or simultaneous open. If a fast open socket is
592 		 * already accepted it is treated as a connected one below.
593 		 */
594 		if (fastopen && !fastopen->sk)
595 			break;
596 
597 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598 
599 		if (!sock_owned_by_user(sk)) {
600 			WRITE_ONCE(sk->sk_err, err);
601 
602 			sk_error_report(sk);
603 
604 			tcp_done(sk);
605 		} else {
606 			WRITE_ONCE(sk->sk_err_soft, err);
607 		}
608 		goto out;
609 	}
610 
611 	/* If we've already connected we will keep trying
612 	 * until we time out, or the user gives up.
613 	 *
614 	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
615 	 * to be treated as hard errors (well, FRAG_FAILED too,
616 	 * but it is obsoleted by PMTU discovery).
617 	 *
618 	 * Note that in the modern internet, where routing is unreliable
619 	 * and broken firewalls sit in every dark corner sending random
620 	 * errors ordered by their masters, even these two messages finally
621 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
622 	 *
623 	 * Now we are in compliance with the RFCs.
624 	 *							--ANK (980905)
625 	 */
626 
627 	if (!sock_owned_by_user(sk) &&
628 	    inet_test_bit(RECVERR, sk)) {
629 		WRITE_ONCE(sk->sk_err, err);
630 		sk_error_report(sk);
631 	} else	{ /* Only an error on timeout */
632 		WRITE_ONCE(sk->sk_err_soft, err);
633 	}
634 
635 out:
636 	bh_unlock_sock(sk);
637 	sock_put(sk);
638 	return 0;
639 }
640 
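/* Prime th->check with the (inverted) pseudo-header checksum and record
 * csum_start/csum_offset so that either the NIC or the software
 * checksum helpers can later fold in the TCP payload
 * (CHECKSUM_PARTIAL style offload).
 */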
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 {
643 	struct tcphdr *th = tcp_hdr(skb);
644 
645 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646 	skb->csum_start = skb_transport_header(skb) - skb->head;
647 	skb->csum_offset = offsetof(struct tcphdr, check);
648 }
649 
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 {
653 	const struct inet_sock *inet = inet_sk(sk);
654 
655 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 }
657 EXPORT_SYMBOL(tcp_v4_send_check);
658 
659 /*
660  *	This routine will send an RST to the other tcp.
661  *
662  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
663  *		      for a reset?
664  *	Answer: if a packet caused a RST, it is not for a socket
665  *		existing in our system; if it is matched to a socket,
666  *		it is just a duplicate segment or a bug in the other
667  *		side's TCP. So we build the reply based only on the
668  *		parameters that arrived with the segment.
669  *	Exception: precedence violation. We do not implement it in any case.
670  */
671 
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #else
675 #define OPTION_BYTES sizeof(__be32)
676 #endif
677 
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 {
680 	const struct tcphdr *th = tcp_hdr(skb);
681 	struct {
682 		struct tcphdr th;
683 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
684 	} rep;
685 	struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687 	struct tcp_md5sig_key *key = NULL;
688 	const __u8 *hash_location = NULL;
689 	unsigned char newhash[16];
690 	int genhash;
691 	struct sock *sk1 = NULL;
692 #endif
693 	u64 transmit_time = 0;
694 	struct sock *ctl_sk;
695 	struct net *net;
696 	u32 txhash = 0;
697 
698 	/* Never send a reset in response to a reset. */
699 	if (th->rst)
700 		return;
701 
702 	/* If sk is not NULL, it means we did a successful lookup and the
703 	 * incoming route had to be correct. prequeue might have dropped our dst.
704 	 */
705 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706 		return;
707 
708 	/* Swap the send and the receive. */
709 	memset(&rep, 0, sizeof(rep));
710 	rep.th.dest   = th->source;
711 	rep.th.source = th->dest;
712 	rep.th.doff   = sizeof(struct tcphdr) / 4;
713 	rep.th.rst    = 1;
714 
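	/* Follow the RFC 793 reset generation rules: if the incoming
	 * segment carried an ACK, the RST uses that ACK value as its own
	 * sequence number; otherwise the RST must itself ACK everything
	 * the peer sent, i.e. SEG.SEQ plus the segment length with SYN
	 * and FIN each counting as one.  For example, a bare SYN with
	 * sequence number S is answered with ack_seq = S + 1.
	 */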
715 	if (th->ack) {
716 		rep.th.seq = th->ack_seq;
717 	} else {
718 		rep.th.ack = 1;
719 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720 				       skb->len - (th->doff << 2));
721 	}
722 
723 	memset(&arg, 0, sizeof(arg));
724 	arg.iov[0].iov_base = (unsigned char *)&rep;
725 	arg.iov[0].iov_len  = sizeof(rep.th);
726 
727 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
729 	rcu_read_lock();
730 	hash_location = tcp_parse_md5sig_option(th);
731 	if (sk && sk_fullsock(sk)) {
732 		const union tcp_md5_addr *addr;
733 		int l3index;
734 
735 		/* sdif set means the packet ingressed via a device
736 		 * in an L3 domain and inet_iif is set to it.
737 		 */
738 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741 	} else if (hash_location) {
742 		const union tcp_md5_addr *addr;
743 		int sdif = tcp_v4_sdif(skb);
744 		int dif = inet_iif(skb);
745 		int l3index;
746 
747 		/*
748 		 * The active side is lost. Try to find the listening socket
749 		 * through the source port, and then find the md5 key through
750 		 * the listening socket. We do not lose security here:
751 		 * the incoming packet is checked against the md5 hash of the
752 		 * found key; no RST is generated if the md5 hash doesn't match.
753 		 */
754 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755 					     NULL, 0, ip_hdr(skb)->saddr,
756 					     th->source, ip_hdr(skb)->daddr,
757 					     ntohs(th->source), dif, sdif);
758 		/* don't send an rst if we can't find a key */
759 		if (!sk1)
760 			goto out;
761 
762 		/* sdif set means the packet ingressed via a device
763 		 * in an L3 domain and dif is set to it.
764 		 */
765 		l3index = sdif ? dif : 0;
766 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768 		if (!key)
769 			goto out;
770 
771 
772 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
774 			goto out;
775 
776 	}
777 
778 	if (key) {
779 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780 				   (TCPOPT_NOP << 16) |
781 				   (TCPOPT_MD5SIG << 8) |
782 				   TCPOLEN_MD5SIG);
783 		/* Update length and the length the header thinks exists */
784 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785 		rep.th.doff = arg.iov[0].iov_len / 4;
786 
787 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788 				     key, ip_hdr(skb)->saddr,
789 				     ip_hdr(skb)->daddr, &rep.th);
790 	}
791 #endif
792 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793 	if (rep.opt[0] == 0) {
794 		__be32 mrst = mptcp_reset_option(skb);
795 
796 		if (mrst) {
797 			rep.opt[0] = mrst;
798 			arg.iov[0].iov_len += sizeof(mrst);
799 			rep.th.doff = arg.iov[0].iov_len / 4;
800 		}
801 	}
802 
803 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804 				      ip_hdr(skb)->saddr, /* XXX */
805 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
806 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808 
809 	/* When the socket is gone, all binding information is lost and
810 	 * routing might fail in this case. No choice here: if we choose to force
811 	 * the input interface, we will misroute in the case of an asymmetric route.
812 	 */
813 	if (sk) {
814 		arg.bound_dev_if = sk->sk_bound_dev_if;
815 		if (sk_fullsock(sk))
816 			trace_tcp_send_reset(sk, skb);
817 	}
818 
819 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821 
822 	arg.tos = ip_hdr(skb)->tos;
823 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824 	local_bh_disable();
825 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
826 	sock_net_set(ctl_sk, net);
827 	if (sk) {
828 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
830 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
832 		transmit_time = tcp_transmit_time(sk);
833 		xfrm_sk_clone_policy(ctl_sk, sk);
834 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836 	} else {
837 		ctl_sk->sk_mark = 0;
838 		ctl_sk->sk_priority = 0;
839 	}
840 	ip_send_unicast_reply(ctl_sk,
841 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
842 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843 			      &arg, arg.iov[0].iov_len,
844 			      transmit_time, txhash);
845 
846 	xfrm_sk_free_policy(ctl_sk);
847 	sock_net_set(ctl_sk, &init_net);
848 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850 	local_bh_enable();
851 
852 #ifdef CONFIG_TCP_MD5SIG
853 out:
854 	rcu_read_unlock();
855 #endif
856 }
857 
858 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
859    outside socket context, is certainly ugly. What can I do?
860  */
861 
862 static void tcp_v4_send_ack(const struct sock *sk,
863 			    struct sk_buff *skb, u32 seq, u32 ack,
864 			    u32 win, u32 tsval, u32 tsecr, int oif,
865 			    struct tcp_md5sig_key *key,
866 			    int reply_flags, u8 tos, u32 txhash)
867 {
868 	const struct tcphdr *th = tcp_hdr(skb);
869 	struct {
870 		struct tcphdr th;
871 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874 #endif
875 			];
876 	} rep;
877 	struct net *net = sock_net(sk);
878 	struct ip_reply_arg arg;
879 	struct sock *ctl_sk;
880 	u64 transmit_time;
881 
882 	memset(&rep.th, 0, sizeof(struct tcphdr));
883 	memset(&arg, 0, sizeof(arg));
884 
885 	arg.iov[0].iov_base = (unsigned char *)&rep;
886 	arg.iov[0].iov_len  = sizeof(rep.th);
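	/* When echoing a timestamp, the option block is laid out as
	 * NOP, NOP, TIMESTAMP, length, followed by the 32-bit TSval and
	 * TSecr words (TCPOLEN_TSTAMP_ALIGNED == 12 bytes in total).
	 */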
887 	if (tsecr) {
888 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889 				   (TCPOPT_TIMESTAMP << 8) |
890 				   TCPOLEN_TIMESTAMP);
891 		rep.opt[1] = htonl(tsval);
892 		rep.opt[2] = htonl(tsecr);
893 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894 	}
895 
896 	/* Swap the send and the receive. */
897 	rep.th.dest    = th->source;
898 	rep.th.source  = th->dest;
899 	rep.th.doff    = arg.iov[0].iov_len / 4;
900 	rep.th.seq     = htonl(seq);
901 	rep.th.ack_seq = htonl(ack);
902 	rep.th.ack     = 1;
903 	rep.th.window  = htons(win);
904 
905 #ifdef CONFIG_TCP_MD5SIG
906 	if (key) {
907 		int offset = (tsecr) ? 3 : 0;
908 
909 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910 					  (TCPOPT_NOP << 16) |
911 					  (TCPOPT_MD5SIG << 8) |
912 					  TCPOLEN_MD5SIG);
913 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914 		rep.th.doff = arg.iov[0].iov_len/4;
915 
916 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917 				    key, ip_hdr(skb)->saddr,
918 				    ip_hdr(skb)->daddr, &rep.th);
919 	}
920 #endif
921 	arg.flags = reply_flags;
922 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923 				      ip_hdr(skb)->saddr, /* XXX */
924 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
925 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926 	if (oif)
927 		arg.bound_dev_if = oif;
928 	arg.tos = tos;
929 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930 	local_bh_disable();
931 	ctl_sk = this_cpu_read(ipv4_tcp_sk);
932 	sock_net_set(ctl_sk, net);
933 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937 	transmit_time = tcp_transmit_time(sk);
938 	ip_send_unicast_reply(ctl_sk,
939 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
940 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941 			      &arg, arg.iov[0].iov_len,
942 			      transmit_time, txhash);
943 
944 	sock_net_set(ctl_sk, &init_net);
945 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946 	local_bh_enable();
947 }
948 
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951 	struct inet_timewait_sock *tw = inet_twsk(sk);
952 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953 
954 	tcp_v4_send_ack(sk, skb,
955 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958 			tcptw->tw_ts_recent,
959 			tw->tw_bound_dev_if,
960 			tcp_twsk_md5_key(tcptw),
961 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962 			tw->tw_tos,
963 			tw->tw_txhash
964 			);
965 
966 	inet_twsk_put(tw);
967 }
968 
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970 				  struct request_sock *req)
971 {
972 	const union tcp_md5_addr *addr;
973 	int l3index;
974 
975 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977 	 */
978 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979 					     tcp_sk(sk)->snd_nxt;
980 
981 	/* RFC 7323 2.3
982 	 * The window field (SEG.WND) of every outgoing segment, with the
983 	 * exception of <SYN> segments, MUST be right-shifted by
984 	 * Rcv.Wind.Shift bits:
985 	 */
986 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988 	tcp_v4_send_ack(sk, skb, seq,
989 			tcp_rsk(req)->rcv_nxt,
990 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992 			READ_ONCE(req->ts_recent),
993 			0,
994 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996 			ip_hdr(skb)->tos,
997 			READ_ONCE(tcp_rsk(req)->txhash));
998 }
999 
1000 /*
1001  *	Send a SYN-ACK after having received a SYN.
1002  *	This still operates on a request_sock only, not on a big
1003  *	socket.
1004  */
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006 			      struct flowi *fl,
1007 			      struct request_sock *req,
1008 			      struct tcp_fastopen_cookie *foc,
1009 			      enum tcp_synack_type synack_type,
1010 			      struct sk_buff *syn_skb)
1011 {
1012 	const struct inet_request_sock *ireq = inet_rsk(req);
1013 	struct flowi4 fl4;
1014 	int err = -1;
1015 	struct sk_buff *skb;
1016 	u8 tos;
1017 
1018 	/* First, grab a route. */
1019 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020 		return -1;
1021 
1022 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023 
1024 	if (skb) {
1025 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026 
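		/* Start from the listener's TOS; if tcp_reflect_tos is
		 * enabled, take the DSCP bits from the original SYN and keep
		 * only the listener's ECN codepoint.
		 */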
1027 		tos = READ_ONCE(inet_sk(sk)->tos);
1028 
1029 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1030 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1031 			      (tos & INET_ECN_MASK);
1032 
1033 		if (!INET_ECN_is_capable(tos) &&
1034 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1035 			tos |= INET_ECN_ECT_0;
1036 
1037 		rcu_read_lock();
1038 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1039 					    ireq->ir_rmt_addr,
1040 					    rcu_dereference(ireq->ireq_opt),
1041 					    tos);
1042 		rcu_read_unlock();
1043 		err = net_xmit_eval(err);
1044 	}
1045 
1046 	return err;
1047 }
1048 
1049 /*
1050  *	IPv4 request_sock destructor.
1051  */
1052 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1053 {
1054 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1055 }
1056 
1057 #ifdef CONFIG_TCP_MD5SIG
1058 /*
1059  * RFC2385 MD5 checksumming requires a mapping of
1060  * IP address->MD5 Key.
1061  * We need to maintain these in the sk structure.
1062  */
1063 
1064 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1065 EXPORT_SYMBOL(tcp_md5_needed);
1066 
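/* Rank candidate keys during lookup: a key bound to an L3 domain
 * always beats one that is not, and between otherwise equal keys the
 * one with the longer prefix wins.
 */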
1067 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1068 {
1069 	if (!old)
1070 		return true;
1071 
1072 	/* l3index always overrides non-l3index */
1073 	if (old->l3index && new->l3index == 0)
1074 		return false;
1075 	if (old->l3index == 0 && new->l3index)
1076 		return true;
1077 
1078 	return old->prefixlen < new->prefixlen;
1079 }
1080 
1081 /* Find the Key structure for an address.  */
1082 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1083 					   const union tcp_md5_addr *addr,
1084 					   int family)
1085 {
1086 	const struct tcp_sock *tp = tcp_sk(sk);
1087 	struct tcp_md5sig_key *key;
1088 	const struct tcp_md5sig_info *md5sig;
1089 	__be32 mask;
1090 	struct tcp_md5sig_key *best_match = NULL;
1091 	bool match;
1092 
1093 	/* caller either holds rcu_read_lock() or socket lock */
1094 	md5sig = rcu_dereference_check(tp->md5sig_info,
1095 				       lockdep_sock_is_held(sk));
1096 	if (!md5sig)
1097 		return NULL;
1098 
1099 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1100 				 lockdep_sock_is_held(sk)) {
1101 		if (key->family != family)
1102 			continue;
1103 		if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1104 			continue;
1105 		if (family == AF_INET) {
1106 			mask = inet_make_mask(key->prefixlen);
1107 			match = (key->addr.a4.s_addr & mask) ==
1108 				(addr->a4.s_addr & mask);
1109 #if IS_ENABLED(CONFIG_IPV6)
1110 		} else if (family == AF_INET6) {
1111 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1112 						  key->prefixlen);
1113 #endif
1114 		} else {
1115 			match = false;
1116 		}
1117 
1118 		if (match && better_md5_match(best_match, key))
1119 			best_match = key;
1120 	}
1121 	return best_match;
1122 }
1123 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1124 
1125 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1126 						      const union tcp_md5_addr *addr,
1127 						      int family, u8 prefixlen,
1128 						      int l3index, u8 flags)
1129 {
1130 	const struct tcp_sock *tp = tcp_sk(sk);
1131 	struct tcp_md5sig_key *key;
1132 	unsigned int size = sizeof(struct in_addr);
1133 	const struct tcp_md5sig_info *md5sig;
1134 
1135 	/* caller either holds rcu_read_lock() or socket lock */
1136 	md5sig = rcu_dereference_check(tp->md5sig_info,
1137 				       lockdep_sock_is_held(sk));
1138 	if (!md5sig)
1139 		return NULL;
1140 #if IS_ENABLED(CONFIG_IPV6)
1141 	if (family == AF_INET6)
1142 		size = sizeof(struct in6_addr);
1143 #endif
1144 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1145 				 lockdep_sock_is_held(sk)) {
1146 		if (key->family != family)
1147 			continue;
1148 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1149 			continue;
1150 		if (key->l3index != l3index)
1151 			continue;
1152 		if (!memcmp(&key->addr, addr, size) &&
1153 		    key->prefixlen == prefixlen)
1154 			return key;
1155 	}
1156 	return NULL;
1157 }
1158 
1159 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1160 					 const struct sock *addr_sk)
1161 {
1162 	const union tcp_md5_addr *addr;
1163 	int l3index;
1164 
1165 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1166 						 addr_sk->sk_bound_dev_if);
1167 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1168 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1169 }
1170 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1171 
1172 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1173 {
1174 	struct tcp_sock *tp = tcp_sk(sk);
1175 	struct tcp_md5sig_info *md5sig;
1176 
1177 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1178 	if (!md5sig)
1179 		return -ENOMEM;
1180 
1181 	sk_gso_disable(sk);
1182 	INIT_HLIST_HEAD(&md5sig->head);
1183 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1184 	return 0;
1185 }
1186 
1187 /* This can be called on a newly created socket, from other files */
1188 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1189 			    int family, u8 prefixlen, int l3index, u8 flags,
1190 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1191 {
1192 	/* Add Key to the list */
1193 	struct tcp_md5sig_key *key;
1194 	struct tcp_sock *tp = tcp_sk(sk);
1195 	struct tcp_md5sig_info *md5sig;
1196 
1197 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1198 	if (key) {
1199 		/* Pre-existing entry - just update that one.
1200 		 * Note that the key might be used concurrently.
1201 		 * data_race() is telling KCSAN that we do not care about
1202 		 * key mismatches, since changing the MD5 key on live flows
1203 		 * can lead to packet drops.
1204 		 */
1205 		data_race(memcpy(key->key, newkey, newkeylen));
1206 
1207 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1208 		 * Also note that a reader could catch the new key->keylen value
1209 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1210 		 * at sock_kmalloc() time below these lines.
1211 		 */
1212 		WRITE_ONCE(key->keylen, newkeylen);
1213 
1214 		return 0;
1215 	}
1216 
1217 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1218 					   lockdep_sock_is_held(sk));
1219 
1220 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1221 	if (!key)
1222 		return -ENOMEM;
1223 	if (!tcp_alloc_md5sig_pool()) {
1224 		sock_kfree_s(sk, key, sizeof(*key));
1225 		return -ENOMEM;
1226 	}
1227 
1228 	memcpy(key->key, newkey, newkeylen);
1229 	key->keylen = newkeylen;
1230 	key->family = family;
1231 	key->prefixlen = prefixlen;
1232 	key->l3index = l3index;
1233 	key->flags = flags;
1234 	memcpy(&key->addr, addr,
1235 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1236 								 sizeof(struct in_addr));
1237 	hlist_add_head_rcu(&key->node, &md5sig->head);
1238 	return 0;
1239 }
1240 
1241 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1242 		   int family, u8 prefixlen, int l3index, u8 flags,
1243 		   const u8 *newkey, u8 newkeylen)
1244 {
1245 	struct tcp_sock *tp = tcp_sk(sk);
1246 
1247 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1248 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1249 			return -ENOMEM;
1250 
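		/* If we cannot take a reference on the tcp_md5_needed static
		 * key, back out the freshly allocated md5sig_info so the
		 * socket is left with no MD5 state, and report -EUSERS.
		 */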
1251 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1252 			struct tcp_md5sig_info *md5sig;
1253 
1254 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1255 			rcu_assign_pointer(tp->md5sig_info, NULL);
1256 			kfree_rcu(md5sig, rcu);
1257 			return -EUSERS;
1258 		}
1259 	}
1260 
1261 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1262 				newkey, newkeylen, GFP_KERNEL);
1263 }
1264 EXPORT_SYMBOL(tcp_md5_do_add);
1265 
1266 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1267 		     int family, u8 prefixlen, int l3index,
1268 		     struct tcp_md5sig_key *key)
1269 {
1270 	struct tcp_sock *tp = tcp_sk(sk);
1271 
1272 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1273 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1274 			return -ENOMEM;
1275 
1276 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1277 			struct tcp_md5sig_info *md5sig;
1278 
1279 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1280 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1281 			rcu_assign_pointer(tp->md5sig_info, NULL);
1282 			kfree_rcu(md5sig, rcu);
1283 			return -EUSERS;
1284 		}
1285 	}
1286 
1287 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1288 				key->flags, key->key, key->keylen,
1289 				sk_gfp_mask(sk, GFP_ATOMIC));
1290 }
1291 EXPORT_SYMBOL(tcp_md5_key_copy);
1292 
1293 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1294 		   u8 prefixlen, int l3index, u8 flags)
1295 {
1296 	struct tcp_md5sig_key *key;
1297 
1298 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1299 	if (!key)
1300 		return -ENOENT;
1301 	hlist_del_rcu(&key->node);
1302 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1303 	kfree_rcu(key, rcu);
1304 	return 0;
1305 }
1306 EXPORT_SYMBOL(tcp_md5_do_del);
1307 
1308 static void tcp_clear_md5_list(struct sock *sk)
1309 {
1310 	struct tcp_sock *tp = tcp_sk(sk);
1311 	struct tcp_md5sig_key *key;
1312 	struct hlist_node *n;
1313 	struct tcp_md5sig_info *md5sig;
1314 
1315 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1316 
1317 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1318 		hlist_del_rcu(&key->node);
1319 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1320 		kfree_rcu(key, rcu);
1321 	}
1322 }
1323 
1324 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1325 				 sockptr_t optval, int optlen)
1326 {
1327 	struct tcp_md5sig cmd;
1328 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1329 	const union tcp_md5_addr *addr;
1330 	u8 prefixlen = 32;
1331 	int l3index = 0;
1332 	u8 flags;
1333 
1334 	if (optlen < sizeof(cmd))
1335 		return -EINVAL;
1336 
1337 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1338 		return -EFAULT;
1339 
1340 	if (sin->sin_family != AF_INET)
1341 		return -EINVAL;
1342 
1343 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1344 
1345 	if (optname == TCP_MD5SIG_EXT &&
1346 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1347 		prefixlen = cmd.tcpm_prefixlen;
1348 		if (prefixlen > 32)
1349 			return -EINVAL;
1350 	}
1351 
1352 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1353 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1354 		struct net_device *dev;
1355 
1356 		rcu_read_lock();
1357 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1358 		if (dev && netif_is_l3_master(dev))
1359 			l3index = dev->ifindex;
1360 
1361 		rcu_read_unlock();
1362 
1363 		/* ok to reference set/not set outside of rcu;
1364 		 * right now device MUST be an L3 master
1365 		 */
1366 		if (!dev || !l3index)
1367 			return -EINVAL;
1368 	}
1369 
1370 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1371 
1372 	if (!cmd.tcpm_keylen)
1373 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1374 
1375 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1376 		return -EINVAL;
1377 
1378 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1379 			      cmd.tcpm_key, cmd.tcpm_keylen);
1380 }
1381 
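/* Feed the RFC 2385 pseudo-header into the MD5 request: source and
 * destination addresses, a zero pad byte, the protocol number and the
 * TCP segment length, followed by a copy of the TCP header with its
 * checksum field zeroed out.
 */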
1382 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1383 				   __be32 daddr, __be32 saddr,
1384 				   const struct tcphdr *th, int nbytes)
1385 {
1386 	struct tcp4_pseudohdr *bp;
1387 	struct scatterlist sg;
1388 	struct tcphdr *_th;
1389 
1390 	bp = hp->scratch;
1391 	bp->saddr = saddr;
1392 	bp->daddr = daddr;
1393 	bp->pad = 0;
1394 	bp->protocol = IPPROTO_TCP;
1395 	bp->len = cpu_to_be16(nbytes);
1396 
1397 	_th = (struct tcphdr *)(bp + 1);
1398 	memcpy(_th, th, sizeof(*th));
1399 	_th->check = 0;
1400 
1401 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1402 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1403 				sizeof(*bp) + sizeof(*th));
1404 	return crypto_ahash_update(hp->md5_req);
1405 }
1406 
1407 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1408 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1409 {
1410 	struct tcp_md5sig_pool *hp;
1411 	struct ahash_request *req;
1412 
1413 	hp = tcp_get_md5sig_pool();
1414 	if (!hp)
1415 		goto clear_hash_noput;
1416 	req = hp->md5_req;
1417 
1418 	if (crypto_ahash_init(req))
1419 		goto clear_hash;
1420 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1421 		goto clear_hash;
1422 	if (tcp_md5_hash_key(hp, key))
1423 		goto clear_hash;
1424 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1425 	if (crypto_ahash_final(req))
1426 		goto clear_hash;
1427 
1428 	tcp_put_md5sig_pool();
1429 	return 0;
1430 
1431 clear_hash:
1432 	tcp_put_md5sig_pool();
1433 clear_hash_noput:
1434 	memset(md5_hash, 0, 16);
1435 	return 1;
1436 }
1437 
1438 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1439 			const struct sock *sk,
1440 			const struct sk_buff *skb)
1441 {
1442 	struct tcp_md5sig_pool *hp;
1443 	struct ahash_request *req;
1444 	const struct tcphdr *th = tcp_hdr(skb);
1445 	__be32 saddr, daddr;
1446 
1447 	if (sk) { /* valid for establish/request sockets */
1448 		saddr = sk->sk_rcv_saddr;
1449 		daddr = sk->sk_daddr;
1450 	} else {
1451 		const struct iphdr *iph = ip_hdr(skb);
1452 		saddr = iph->saddr;
1453 		daddr = iph->daddr;
1454 	}
1455 
1456 	hp = tcp_get_md5sig_pool();
1457 	if (!hp)
1458 		goto clear_hash_noput;
1459 	req = hp->md5_req;
1460 
1461 	if (crypto_ahash_init(req))
1462 		goto clear_hash;
1463 
1464 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1465 		goto clear_hash;
1466 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1467 		goto clear_hash;
1468 	if (tcp_md5_hash_key(hp, key))
1469 		goto clear_hash;
1470 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1471 	if (crypto_ahash_final(req))
1472 		goto clear_hash;
1473 
1474 	tcp_put_md5sig_pool();
1475 	return 0;
1476 
1477 clear_hash:
1478 	tcp_put_md5sig_pool();
1479 clear_hash_noput:
1480 	memset(md5_hash, 0, 16);
1481 	return 1;
1482 }
1483 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1484 
1485 #endif
1486 
1487 static void tcp_v4_init_req(struct request_sock *req,
1488 			    const struct sock *sk_listener,
1489 			    struct sk_buff *skb)
1490 {
1491 	struct inet_request_sock *ireq = inet_rsk(req);
1492 	struct net *net = sock_net(sk_listener);
1493 
1494 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1495 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1496 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1497 }
1498 
1499 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1500 					  struct sk_buff *skb,
1501 					  struct flowi *fl,
1502 					  struct request_sock *req)
1503 {
1504 	tcp_v4_init_req(req, sk, skb);
1505 
1506 	if (security_inet_conn_request(sk, skb, req))
1507 		return NULL;
1508 
1509 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1510 }
1511 
1512 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1513 	.family		=	PF_INET,
1514 	.obj_size	=	sizeof(struct tcp_request_sock),
1515 	.rtx_syn_ack	=	tcp_rtx_synack,
1516 	.send_ack	=	tcp_v4_reqsk_send_ack,
1517 	.destructor	=	tcp_v4_reqsk_destructor,
1518 	.send_reset	=	tcp_v4_send_reset,
1519 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1520 };
1521 
1522 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1523 	.mss_clamp	=	TCP_MSS_DEFAULT,
1524 #ifdef CONFIG_TCP_MD5SIG
1525 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1526 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1527 #endif
1528 #ifdef CONFIG_SYN_COOKIES
1529 	.cookie_init_seq =	cookie_v4_init_sequence,
1530 #endif
1531 	.route_req	=	tcp_v4_route_req,
1532 	.init_seq	=	tcp_v4_init_seq,
1533 	.init_ts_off	=	tcp_v4_init_ts_off,
1534 	.send_synack	=	tcp_v4_send_synack,
1535 };
1536 
1537 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1538 {
1539 	/* Never answer SYNs sent to broadcast or multicast addresses */
1540 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1541 		goto drop;
1542 
1543 	return tcp_conn_request(&tcp_request_sock_ops,
1544 				&tcp_request_sock_ipv4_ops, sk, skb);
1545 
1546 drop:
1547 	tcp_listendrop(sk);
1548 	return 0;
1549 }
1550 EXPORT_SYMBOL(tcp_v4_conn_request);
1551 
1552 
1553 /*
1554  * The three way handshake has completed - we got a valid synack -
1555  * now create the new socket.
1556  */
1557 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1558 				  struct request_sock *req,
1559 				  struct dst_entry *dst,
1560 				  struct request_sock *req_unhash,
1561 				  bool *own_req)
1562 {
1563 	struct inet_request_sock *ireq;
1564 	bool found_dup_sk = false;
1565 	struct inet_sock *newinet;
1566 	struct tcp_sock *newtp;
1567 	struct sock *newsk;
1568 #ifdef CONFIG_TCP_MD5SIG
1569 	const union tcp_md5_addr *addr;
1570 	struct tcp_md5sig_key *key;
1571 	int l3index;
1572 #endif
1573 	struct ip_options_rcu *inet_opt;
1574 
1575 	if (sk_acceptq_is_full(sk))
1576 		goto exit_overflow;
1577 
1578 	newsk = tcp_create_openreq_child(sk, req, skb);
1579 	if (!newsk)
1580 		goto exit_nonewsk;
1581 
1582 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1583 	inet_sk_rx_dst_set(newsk, skb);
1584 
1585 	newtp		      = tcp_sk(newsk);
1586 	newinet		      = inet_sk(newsk);
1587 	ireq		      = inet_rsk(req);
1588 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1589 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1590 	newsk->sk_bound_dev_if = ireq->ir_iif;
1591 	newinet->inet_saddr   = ireq->ir_loc_addr;
1592 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1593 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1594 	newinet->mc_index     = inet_iif(skb);
1595 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1596 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1597 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1598 	if (inet_opt)
1599 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1600 	atomic_set(&newinet->inet_id, get_random_u16());
1601 
1602 	/* Set ToS of the new socket based upon the value of incoming SYN.
1603 	 * ECT bits are set later in tcp_init_transfer().
1604 	 */
1605 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1606 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1607 
1608 	if (!dst) {
1609 		dst = inet_csk_route_child_sock(sk, newsk, req);
1610 		if (!dst)
1611 			goto put_and_exit;
1612 	} else {
1613 		/* syncookie case : see end of cookie_v4_check() */
1614 	}
1615 	sk_setup_caps(newsk, dst);
1616 
1617 	tcp_ca_openreq_child(newsk, dst);
1618 
1619 	tcp_sync_mss(newsk, dst_mtu(dst));
1620 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1621 
1622 	tcp_initialize_rcv_mss(newsk);
1623 
1624 #ifdef CONFIG_TCP_MD5SIG
1625 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1626 	/* Copy over the MD5 key from the original socket */
1627 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1628 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1629 	if (key) {
1630 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1631 			goto put_and_exit;
1632 		sk_gso_disable(newsk);
1633 	}
1634 #endif
1635 
1636 	if (__inet_inherit_port(sk, newsk) < 0)
1637 		goto put_and_exit;
1638 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1639 				       &found_dup_sk);
1640 	if (likely(*own_req)) {
1641 		tcp_move_syn(newtp, req);
1642 		ireq->ireq_opt = NULL;
1643 	} else {
1644 		newinet->inet_opt = NULL;
1645 
1646 		if (!req_unhash && found_dup_sk) {
1647 			/* This code path should be executed only in the
1648 			 * syncookie case
1649 			 */
1650 			bh_unlock_sock(newsk);
1651 			sock_put(newsk);
1652 			newsk = NULL;
1653 		}
1654 	}
1655 	return newsk;
1656 
1657 exit_overflow:
1658 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1659 exit_nonewsk:
1660 	dst_release(dst);
1661 exit:
1662 	tcp_listendrop(sk);
1663 	return NULL;
1664 put_and_exit:
1665 	newinet->inet_opt = NULL;
1666 	inet_csk_prepare_forced_close(newsk);
1667 	tcp_done(newsk);
1668 	goto exit;
1669 }
1670 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1671 
1672 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1673 {
1674 #ifdef CONFIG_SYN_COOKIES
1675 	const struct tcphdr *th = tcp_hdr(skb);
1676 
1677 	if (!th->syn)
1678 		sk = cookie_v4_check(sk, skb);
1679 #endif
1680 	return sk;
1681 }
1682 
1683 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1684 			 struct tcphdr *th, u32 *cookie)
1685 {
1686 	u16 mss = 0;
1687 #ifdef CONFIG_SYN_COOKIES
1688 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1689 				    &tcp_request_sock_ipv4_ops, sk, th);
1690 	if (mss) {
1691 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1692 		tcp_synq_overflow(sk);
1693 	}
1694 #endif
1695 	return mss;
1696 }
1697 
1698 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1699 							   u32));
1700 /* The socket must have its spinlock held when we get
1701  * here, unless it is a TCP_LISTEN socket.
1702  *
1703  * We have a potential double-lock case here, so even when
1704  * doing backlog processing we use the BH locking scheme.
1705  * This is because we cannot sleep with the original spinlock
1706  * held.
1707  */
1708 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1709 {
1710 	enum skb_drop_reason reason;
1711 	struct sock *rsk;
1712 
1713 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1714 		struct dst_entry *dst;
1715 
1716 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1717 						lockdep_sock_is_held(sk));
1718 
1719 		sock_rps_save_rxhash(sk, skb);
1720 		sk_mark_napi_id(sk, skb);
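		/* Validate the cached input route: if the packet arrived on a
		 * different interface or dst->ops->check() says the route is
		 * stale, drop the cached entry so a fresh one can be installed
		 * later.
		 */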
1721 		if (dst) {
1722 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1723 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1724 					     dst, 0)) {
1725 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1726 				dst_release(dst);
1727 			}
1728 		}
1729 		tcp_rcv_established(sk, skb);
1730 		return 0;
1731 	}
1732 
1733 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
1734 	if (tcp_checksum_complete(skb))
1735 		goto csum_err;
1736 
1737 	if (sk->sk_state == TCP_LISTEN) {
1738 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1739 
1740 		if (!nsk)
1741 			goto discard;
1742 		if (nsk != sk) {
1743 			if (tcp_child_process(sk, nsk, skb)) {
1744 				rsk = nsk;
1745 				goto reset;
1746 			}
1747 			return 0;
1748 		}
1749 	} else
1750 		sock_rps_save_rxhash(sk, skb);
1751 
1752 	if (tcp_rcv_state_process(sk, skb)) {
1753 		rsk = sk;
1754 		goto reset;
1755 	}
1756 	return 0;
1757 
1758 reset:
1759 	tcp_v4_send_reset(rsk, skb);
1760 discard:
1761 	kfree_skb_reason(skb, reason);
1762 	/* Be careful here. If this function gets more complicated and
1763 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1764 	 * might be destroyed here. This current version compiles correctly,
1765 	 * but you have been warned.
1766 	 */
1767 	return 0;
1768 
1769 csum_err:
1770 	reason = SKB_DROP_REASON_TCP_CSUM;
1771 	trace_tcp_bad_csum(skb);
1772 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1773 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1774 	goto discard;
1775 }
1776 EXPORT_SYMBOL(tcp_v4_do_rcv);
1777 
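/* Early demux: while the packet is still in the IP receive path, try
 * to find the established socket it belongs to so that the socket
 * reference and its cached input route can be attached to the skb
 * before the normal lookup runs.
 */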
1778 int tcp_v4_early_demux(struct sk_buff *skb)
1779 {
1780 	struct net *net = dev_net(skb->dev);
1781 	const struct iphdr *iph;
1782 	const struct tcphdr *th;
1783 	struct sock *sk;
1784 
1785 	if (skb->pkt_type != PACKET_HOST)
1786 		return 0;
1787 
1788 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1789 		return 0;
1790 
1791 	iph = ip_hdr(skb);
1792 	th = tcp_hdr(skb);
1793 
1794 	if (th->doff < sizeof(struct tcphdr) / 4)
1795 		return 0;
1796 
1797 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1798 				       iph->saddr, th->source,
1799 				       iph->daddr, ntohs(th->dest),
1800 				       skb->skb_iif, inet_sdif(skb));
1801 	if (sk) {
1802 		skb->sk = sk;
1803 		skb->destructor = sock_edemux;
1804 		if (sk_fullsock(sk)) {
1805 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1806 
1807 			if (dst)
1808 				dst = dst_check(dst, 0);
1809 			if (dst &&
1810 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
1811 				skb_dst_set_noref(skb, dst);
1812 		}
1813 	}
1814 	return 0;
1815 }
1816 
1817 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1818 		     enum skb_drop_reason *reason)
1819 {
1820 	u32 limit, tail_gso_size, tail_gso_segs;
1821 	struct skb_shared_info *shinfo;
1822 	const struct tcphdr *th;
1823 	struct tcphdr *thtail;
1824 	struct sk_buff *tail;
1825 	unsigned int hdrlen;
1826 	bool fragstolen;
1827 	u32 gso_segs;
1828 	u32 gso_size;
1829 	int delta;
1830 
1831 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1832 	 * we can fix skb->truesize to its real value to avoid future drops.
1833 	 * This is valid because skb is not yet charged to the socket.
1834 	 * It has been noticed that pure SACK packets were sometimes dropped
1835 	 * (if cooked by drivers without the copybreak feature).
1836 	 */
1837 	skb_condense(skb);
1838 
1839 	skb_dst_drop(skb);
1840 
1841 	if (unlikely(tcp_checksum_complete(skb))) {
1842 		bh_unlock_sock(sk);
1843 		trace_tcp_bad_csum(skb);
1844 		*reason = SKB_DROP_REASON_TCP_CSUM;
1845 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1846 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1847 		return true;
1848 	}
1849 
1850 	/* Attempt coalescing to last skb in backlog, even if we are
1851 	 * above the limits.
1852 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1853 	 */
1854 	th = (const struct tcphdr *)skb->data;
1855 	hdrlen = th->doff * 4;
1856 
1857 	tail = sk->sk_backlog.tail;
1858 	if (!tail)
1859 		goto no_coalesce;
1860 	thtail = (struct tcphdr *)tail->data;
1861 
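	/* Coalescing requires, among other things, contiguous sequence
	 * space, an identical IP DS field, no SYN/RST/URG on either skb,
	 * ACK set on both, matching ECE/CWR flags and identical TCP
	 * option bytes.
	 */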
1862 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1863 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1864 	    ((TCP_SKB_CB(tail)->tcp_flags |
1865 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1866 	    !((TCP_SKB_CB(tail)->tcp_flags &
1867 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1868 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1869 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1870 #ifdef CONFIG_TLS_DEVICE
1871 	    tail->decrypted != skb->decrypted ||
1872 #endif
1873 	    !mptcp_skb_can_collapse(tail, skb) ||
1874 	    thtail->doff != th->doff ||
1875 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1876 		goto no_coalesce;
1877 
1878 	__skb_pull(skb, hdrlen);
1879 
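	/* For non-GSO skbs gso_size/gso_segs are zero, so fall back to
	 * treating the whole payload as a single segment when updating
	 * the coalesced counters below.
	 */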
1880 	shinfo = skb_shinfo(skb);
1881 	gso_size = shinfo->gso_size ?: skb->len;
1882 	gso_segs = shinfo->gso_segs ?: 1;
1883 
1884 	shinfo = skb_shinfo(tail);
1885 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1886 	tail_gso_segs = shinfo->gso_segs ?: 1;
1887 
1888 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1889 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1890 
1891 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1892 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1893 			thtail->window = th->window;
1894 		}
1895 
1896 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1897 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1898 		 * is not entered if we append a packet with a FIN.
1899 		 * SYN, RST, URG are not present.
1900 		 * ACK is set on both packets.
1901 		 * PSH : we do not really care in the TCP stack,
1902 		 *       at least for 'GRO' packets.
1903 		 */
1904 		thtail->fin |= th->fin;
1905 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1906 
1907 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1908 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1909 			tail->tstamp = skb->tstamp;
1910 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1911 		}
1912 
1913 		/* Not as strict as GRO. We only need to carry mss max value */
1914 		shinfo->gso_size = max(gso_size, tail_gso_size);
1915 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1916 
1917 		sk->sk_backlog.len += delta;
1918 		__NET_INC_STATS(sock_net(sk),
1919 				LINUX_MIB_TCPBACKLOGCOALESCE);
1920 		kfree_skb_partial(skb, fragstolen);
1921 		return false;
1922 	}
1923 	__skb_push(skb, hdrlen);
1924 
1925 no_coalesce:
1926 	limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1927 
1928 	/* Only the socket owner can try to collapse/prune rx queues
1929 	 * to reduce memory overhead, so add a little headroom here.
1930 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1931 	 */
1932 	limit += 64 * 1024;
1933 
1934 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1935 		bh_unlock_sock(sk);
1936 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1937 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1938 		return true;
1939 	}
1940 	return false;
1941 }
1942 EXPORT_SYMBOL(tcp_add_backlog);
1943 
1944 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1945 {
1946 	struct tcphdr *th = (struct tcphdr *)skb->data;
1947 
1948 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1949 }
1950 EXPORT_SYMBOL(tcp_filter);
1951 
1952 static void tcp_v4_restore_cb(struct sk_buff *skb)
1953 {
1954 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1955 		sizeof(struct inet_skb_parm));
1956 }
1957 
1958 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1959 			   const struct tcphdr *th)
1960 {
1961 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1962 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
1963 	 */
1964 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1965 		sizeof(struct inet_skb_parm));
1966 	barrier();
1967 
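	/* SYN and FIN each occupy one unit of sequence space, hence they
	 * are added to the payload length when computing end_seq.
	 */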
1968 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1969 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1970 				    skb->len - th->doff * 4);
1971 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1972 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1973 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1974 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1975 	TCP_SKB_CB(skb)->sacked	 = 0;
1976 	TCP_SKB_CB(skb)->has_rxtstamp =
1977 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1978 }
1979 
1980 /*
1981  *	From tcp_input.c
1982  */
1983 
1984 int tcp_v4_rcv(struct sk_buff *skb)
1985 {
1986 	struct net *net = dev_net(skb->dev);
1987 	enum skb_drop_reason drop_reason;
1988 	int sdif = inet_sdif(skb);
1989 	int dif = inet_iif(skb);
1990 	const struct iphdr *iph;
1991 	const struct tcphdr *th;
1992 	bool refcounted;
1993 	struct sock *sk;
1994 	int ret;
1995 
1996 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1997 	if (skb->pkt_type != PACKET_HOST)
1998 		goto discard_it;
1999 
2000 	/* Count it even if it's bad */
2001 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2002 
2003 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2004 		goto discard_it;
2005 
2006 	th = (const struct tcphdr *)skb->data;
2007 
2008 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2009 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2010 		goto bad_packet;
2011 	}
2012 	if (!pskb_may_pull(skb, th->doff * 4))
2013 		goto discard_it;
2014 
2015 	/* An explanation is required here, I think.
2016 	 * Packet length and doff are validated by header prediction,
2017 	 * provided the case of th->doff == 0 is eliminated.
2018 	 * So, we defer the checks. */
2019 
2020 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2021 		goto csum_error;
2022 
2023 	th = (const struct tcphdr *)skb->data;
2024 	iph = ip_hdr(skb);
2025 lookup:
2026 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2027 			       skb, __tcp_hdrlen(th), th->source,
2028 			       th->dest, sdif, &refcounted);
2029 	if (!sk)
2030 		goto no_tcp_socket;
2031 
2032 process:
2033 	if (sk->sk_state == TCP_TIME_WAIT)
2034 		goto do_time_wait;
2035 
2036 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2037 		struct request_sock *req = inet_reqsk(sk);
2038 		bool req_stolen = false;
2039 		struct sock *nsk;
2040 
2041 		sk = req->rsk_listener;
2042 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2043 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2044 		else
2045 			drop_reason = tcp_inbound_md5_hash(sk, skb,
2046 						   &iph->saddr, &iph->daddr,
2047 						   AF_INET, dif, sdif);
2048 		if (unlikely(drop_reason)) {
2049 			sk_drops_add(sk, skb);
2050 			reqsk_put(req);
2051 			goto discard_it;
2052 		}
2053 		if (tcp_checksum_complete(skb)) {
2054 			reqsk_put(req);
2055 			goto csum_error;
2056 		}
2057 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2058 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2059 			if (!nsk) {
2060 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2061 				goto lookup;
2062 			}
2063 			sk = nsk;
2064 			/* reuseport_migrate_sock() has already held one sk_refcnt
2065 			 * before returning.
2066 			 */
2067 		} else {
2068 			/* We own a reference on the listener, increase it again
2069 			 * as we might lose it too soon.
2070 			 */
2071 			sock_hold(sk);
2072 		}
2073 		refcounted = true;
2074 		nsk = NULL;
2075 		if (!tcp_filter(sk, skb)) {
2076 			th = (const struct tcphdr *)skb->data;
2077 			iph = ip_hdr(skb);
2078 			tcp_v4_fill_cb(skb, iph, th);
2079 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2080 		} else {
2081 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2082 		}
2083 		if (!nsk) {
2084 			reqsk_put(req);
2085 			if (req_stolen) {
2086 				/* Another cpu got exclusive access to req
2087 				 * and created a full blown socket.
2088 				 * Try to feed this packet to this socket
2089 				 * instead of discarding it.
2090 				 */
2091 				tcp_v4_restore_cb(skb);
2092 				sock_put(sk);
2093 				goto lookup;
2094 			}
2095 			goto discard_and_relse;
2096 		}
2097 		nf_reset_ct(skb);
2098 		if (nsk == sk) {
2099 			reqsk_put(req);
2100 			tcp_v4_restore_cb(skb);
2101 		} else if (tcp_child_process(sk, nsk, skb)) {
2102 			tcp_v4_send_reset(nsk, skb);
2103 			goto discard_and_relse;
2104 		} else {
2105 			sock_put(sk);
2106 			return 0;
2107 		}
2108 	}
2109 
2110 	if (static_branch_unlikely(&ip4_min_ttl)) {
2111 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2112 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2113 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2114 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2115 			goto discard_and_relse;
2116 		}
2117 	}
2118 
2119 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2120 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2121 		goto discard_and_relse;
2122 	}
2123 
2124 	drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2125 					   &iph->daddr, AF_INET, dif, sdif);
2126 	if (drop_reason)
2127 		goto discard_and_relse;
2128 
2129 	nf_reset_ct(skb);
2130 
2131 	if (tcp_filter(sk, skb)) {
2132 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2133 		goto discard_and_relse;
2134 	}
2135 	th = (const struct tcphdr *)skb->data;
2136 	iph = ip_hdr(skb);
2137 	tcp_v4_fill_cb(skb, iph, th);
2138 
2139 	skb->dev = NULL;
2140 
2141 	if (sk->sk_state == TCP_LISTEN) {
2142 		ret = tcp_v4_do_rcv(sk, skb);
2143 		goto put_and_return;
2144 	}
2145 
2146 	sk_incoming_cpu_update(sk);
2147 
2148 	bh_lock_sock_nested(sk);
2149 	tcp_segs_in(tcp_sk(sk), skb);
2150 	ret = 0;
2151 	if (!sock_owned_by_user(sk)) {
2152 		ret = tcp_v4_do_rcv(sk, skb);
2153 	} else {
2154 		if (tcp_add_backlog(sk, skb, &drop_reason))
2155 			goto discard_and_relse;
2156 	}
2157 	bh_unlock_sock(sk);
2158 
2159 put_and_return:
2160 	if (refcounted)
2161 		sock_put(sk);
2162 
2163 	return ret;
2164 
2165 no_tcp_socket:
2166 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2167 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2168 		goto discard_it;
2169 
2170 	tcp_v4_fill_cb(skb, iph, th);
2171 
2172 	if (tcp_checksum_complete(skb)) {
2173 csum_error:
2174 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2175 		trace_tcp_bad_csum(skb);
2176 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2177 bad_packet:
2178 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2179 	} else {
2180 		tcp_v4_send_reset(NULL, skb);
2181 	}
2182 
2183 discard_it:
2184 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2185 	/* Discard frame. */
2186 	kfree_skb_reason(skb, drop_reason);
2187 	return 0;
2188 
2189 discard_and_relse:
2190 	sk_drops_add(sk, skb);
2191 	if (refcounted)
2192 		sock_put(sk);
2193 	goto discard_it;
2194 
2195 do_time_wait:
2196 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2197 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2198 		inet_twsk_put(inet_twsk(sk));
2199 		goto discard_it;
2200 	}
2201 
2202 	tcp_v4_fill_cb(skb, iph, th);
2203 
2204 	if (tcp_checksum_complete(skb)) {
2205 		inet_twsk_put(inet_twsk(sk));
2206 		goto csum_error;
2207 	}
2208 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2209 	case TCP_TW_SYN: {
2210 		struct sock *sk2 = inet_lookup_listener(net,
2211 							net->ipv4.tcp_death_row.hashinfo,
2212 							skb, __tcp_hdrlen(th),
2213 							iph->saddr, th->source,
2214 							iph->daddr, th->dest,
2215 							inet_iif(skb),
2216 							sdif);
2217 		if (sk2) {
2218 			inet_twsk_deschedule_put(inet_twsk(sk));
2219 			sk = sk2;
2220 			tcp_v4_restore_cb(skb);
2221 			refcounted = false;
2222 			goto process;
2223 		}
2224 	}
2225 		/* to ACK */
2226 		fallthrough;
2227 	case TCP_TW_ACK:
2228 		tcp_v4_timewait_ack(sk, skb);
2229 		break;
2230 	case TCP_TW_RST:
2231 		tcp_v4_send_reset(sk, skb);
2232 		inet_twsk_deschedule_put(inet_twsk(sk));
2233 		goto discard_it;
2234 	case TCP_TW_SUCCESS:;
2235 	}
2236 	goto discard_it;
2237 }
2238 
2239 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2240 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2241 	.twsk_unique	= tcp_twsk_unique,
2242 	.twsk_destructor= tcp_twsk_destructor,
2243 };
2244 
2245 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2246 {
2247 	struct dst_entry *dst = skb_dst(skb);
2248 
2249 	if (dst && dst_hold_safe(dst)) {
2250 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2251 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2252 	}
2253 }
2254 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2255 
2256 const struct inet_connection_sock_af_ops ipv4_specific = {
2257 	.queue_xmit	   = ip_queue_xmit,
2258 	.send_check	   = tcp_v4_send_check,
2259 	.rebuild_header	   = inet_sk_rebuild_header,
2260 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2261 	.conn_request	   = tcp_v4_conn_request,
2262 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2263 	.net_header_len	   = sizeof(struct iphdr),
2264 	.setsockopt	   = ip_setsockopt,
2265 	.getsockopt	   = ip_getsockopt,
2266 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2267 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2268 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2269 };
2270 EXPORT_SYMBOL(ipv4_specific);
2271 
2272 #ifdef CONFIG_TCP_MD5SIG
2273 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2274 	.md5_lookup		= tcp_v4_md5_lookup,
2275 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2276 	.md5_parse		= tcp_v4_parse_md5_keys,
2277 };
2278 #endif
2279 
2280 /* NOTE: A lot of things are set to zero explicitly by the call to
2281  *       sk_alloc(), so they need not be done here.
2282  */
2283 static int tcp_v4_init_sock(struct sock *sk)
2284 {
2285 	struct inet_connection_sock *icsk = inet_csk(sk);
2286 
2287 	tcp_init_sock(sk);
2288 
2289 	icsk->icsk_af_ops = &ipv4_specific;
2290 
2291 #ifdef CONFIG_TCP_MD5SIG
2292 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2293 #endif
2294 
2295 	return 0;
2296 }
2297 
2298 void tcp_v4_destroy_sock(struct sock *sk)
2299 {
2300 	struct tcp_sock *tp = tcp_sk(sk);
2301 
2302 	trace_tcp_destroy_sock(sk);
2303 
2304 	tcp_clear_xmit_timers(sk);
2305 
2306 	tcp_cleanup_congestion_control(sk);
2307 
2308 	tcp_cleanup_ulp(sk);
2309 
2310 	/* Clean up the write buffer. */
2311 	tcp_write_queue_purge(sk);
2312 
2313 	/* Check if we want to disable active TFO */
2314 	tcp_fastopen_active_disable_ofo_check(sk);
2315 
2316 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2317 	skb_rbtree_purge(&tp->out_of_order_queue);
2318 
2319 #ifdef CONFIG_TCP_MD5SIG
2320 	/* Clean up the MD5 key list, if any */
2321 	if (tp->md5sig_info) {
2322 		tcp_clear_md5_list(sk);
2323 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2324 		tp->md5sig_info = NULL;
2325 		static_branch_slow_dec_deferred(&tcp_md5_needed);
2326 	}
2327 #endif
2328 
2329 	/* Clean up a referenced TCP bind bucket. */
2330 	if (inet_csk(sk)->icsk_bind_hash)
2331 		inet_put_port(sk);
2332 
2333 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2334 
2335 	/* If socket is aborted during connect operation */
2336 	tcp_free_fastopen_req(tp);
2337 	tcp_fastopen_destroy_cipher(sk);
2338 	tcp_saved_syn_free(tp);
2339 
2340 	sk_sockets_allocated_dec(sk);
2341 }
2342 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2343 
2344 #ifdef CONFIG_PROC_FS
2345 /* Proc filesystem TCP sock list dumping. */
2346 
2347 static unsigned short seq_file_family(const struct seq_file *seq);
2348 
2349 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2350 {
2351 	unsigned short family = seq_file_family(seq);
2352 
2353 	/* AF_UNSPEC is used as a match all */
2354 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2355 		net_eq(sock_net(sk), seq_file_net(seq)));
2356 }
2357 
2358 /* Find a non-empty bucket (starting from st->bucket)
2359  * and return the first sk from it.
2360  */
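/* Note: when a socket is returned, the matching lhash2 bucket lock is
 * still held; it is released by listening_get_next() when it moves on
 * to another bucket, or by tcp_seq_stop().
 */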
2361 static void *listening_get_first(struct seq_file *seq)
2362 {
2363 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2364 	struct tcp_iter_state *st = seq->private;
2365 
2366 	st->offset = 0;
2367 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2368 		struct inet_listen_hashbucket *ilb2;
2369 		struct hlist_nulls_node *node;
2370 		struct sock *sk;
2371 
2372 		ilb2 = &hinfo->lhash2[st->bucket];
2373 		if (hlist_nulls_empty(&ilb2->nulls_head))
2374 			continue;
2375 
2376 		spin_lock(&ilb2->lock);
2377 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2378 			if (seq_sk_match(seq, sk))
2379 				return sk;
2380 		}
2381 		spin_unlock(&ilb2->lock);
2382 	}
2383 
2384 	return NULL;
2385 }
2386 
2387 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2388  * If "cur" is the last one in st->bucket,
2389  * call listening_get_first() to return the first sk of the next
2390  * non-empty bucket.
2391  */
2392 static void *listening_get_next(struct seq_file *seq, void *cur)
2393 {
2394 	struct tcp_iter_state *st = seq->private;
2395 	struct inet_listen_hashbucket *ilb2;
2396 	struct hlist_nulls_node *node;
2397 	struct inet_hashinfo *hinfo;
2398 	struct sock *sk = cur;
2399 
2400 	++st->num;
2401 	++st->offset;
2402 
2403 	sk = sk_nulls_next(sk);
2404 	sk_nulls_for_each_from(sk, node) {
2405 		if (seq_sk_match(seq, sk))
2406 			return sk;
2407 	}
2408 
2409 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2410 	ilb2 = &hinfo->lhash2[st->bucket];
2411 	spin_unlock(&ilb2->lock);
2412 	++st->bucket;
2413 	return listening_get_first(seq);
2414 }
2415 
2416 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2417 {
2418 	struct tcp_iter_state *st = seq->private;
2419 	void *rc;
2420 
2421 	st->bucket = 0;
2422 	st->offset = 0;
2423 	rc = listening_get_first(seq);
2424 
2425 	while (rc && *pos) {
2426 		rc = listening_get_next(seq, rc);
2427 		--*pos;
2428 	}
2429 	return rc;
2430 }
2431 
2432 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2433 				const struct tcp_iter_state *st)
2434 {
2435 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2436 }
2437 
2438 /*
2439  * Get the first established socket, starting from the bucket given in st->bucket.
2440  * If st->bucket is zero, the very first socket in the hash is returned.
2441  */
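/* Note: when a socket is returned, the matching ehash bucket lock is
 * still held (with BHs disabled); it is released by
 * established_get_next() or tcp_seq_stop().
 */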
2442 static void *established_get_first(struct seq_file *seq)
2443 {
2444 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2445 	struct tcp_iter_state *st = seq->private;
2446 
2447 	st->offset = 0;
2448 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2449 		struct sock *sk;
2450 		struct hlist_nulls_node *node;
2451 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2452 
2453 		cond_resched();
2454 
2455 		/* Lockless fast path for the common case of empty buckets */
2456 		if (empty_bucket(hinfo, st))
2457 			continue;
2458 
2459 		spin_lock_bh(lock);
2460 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2461 			if (seq_sk_match(seq, sk))
2462 				return sk;
2463 		}
2464 		spin_unlock_bh(lock);
2465 	}
2466 
2467 	return NULL;
2468 }
2469 
2470 static void *established_get_next(struct seq_file *seq, void *cur)
2471 {
2472 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2473 	struct tcp_iter_state *st = seq->private;
2474 	struct hlist_nulls_node *node;
2475 	struct sock *sk = cur;
2476 
2477 	++st->num;
2478 	++st->offset;
2479 
2480 	sk = sk_nulls_next(sk);
2481 
2482 	sk_nulls_for_each_from(sk, node) {
2483 		if (seq_sk_match(seq, sk))
2484 			return sk;
2485 	}
2486 
2487 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2488 	++st->bucket;
2489 	return established_get_first(seq);
2490 }
2491 
2492 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2493 {
2494 	struct tcp_iter_state *st = seq->private;
2495 	void *rc;
2496 
2497 	st->bucket = 0;
2498 	rc = established_get_first(seq);
2499 
2500 	while (rc && pos) {
2501 		rc = established_get_next(seq, rc);
2502 		--pos;
2503 	}
2504 	return rc;
2505 }
2506 
2507 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2508 {
2509 	void *rc;
2510 	struct tcp_iter_state *st = seq->private;
2511 
2512 	st->state = TCP_SEQ_STATE_LISTENING;
2513 	rc	  = listening_get_idx(seq, &pos);
2514 
2515 	if (!rc) {
2516 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2517 		rc	  = established_get_idx(seq, pos);
2518 	}
2519 
2520 	return rc;
2521 }
2522 
2523 static void *tcp_seek_last_pos(struct seq_file *seq)
2524 {
2525 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2526 	struct tcp_iter_state *st = seq->private;
2527 	int bucket = st->bucket;
2528 	int offset = st->offset;
2529 	int orig_num = st->num;
2530 	void *rc = NULL;
2531 
2532 	switch (st->state) {
2533 	case TCP_SEQ_STATE_LISTENING:
2534 		if (st->bucket > hinfo->lhash2_mask)
2535 			break;
2536 		rc = listening_get_first(seq);
2537 		while (offset-- && rc && bucket == st->bucket)
2538 			rc = listening_get_next(seq, rc);
2539 		if (rc)
2540 			break;
2541 		st->bucket = 0;
2542 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2543 		fallthrough;
2544 	case TCP_SEQ_STATE_ESTABLISHED:
2545 		if (st->bucket > hinfo->ehash_mask)
2546 			break;
2547 		rc = established_get_first(seq);
2548 		while (offset-- && rc && bucket == st->bucket)
2549 			rc = established_get_next(seq, rc);
2550 	}
2551 
2552 	st->num = orig_num;
2553 
2554 	return rc;
2555 }
2556 
2557 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2558 {
2559 	struct tcp_iter_state *st = seq->private;
2560 	void *rc;
2561 
2562 	if (*pos && *pos == st->last_pos) {
2563 		rc = tcp_seek_last_pos(seq);
2564 		if (rc)
2565 			goto out;
2566 	}
2567 
2568 	st->state = TCP_SEQ_STATE_LISTENING;
2569 	st->num = 0;
2570 	st->bucket = 0;
2571 	st->offset = 0;
2572 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2573 
2574 out:
2575 	st->last_pos = *pos;
2576 	return rc;
2577 }
2578 EXPORT_SYMBOL(tcp_seq_start);
2579 
2580 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2581 {
2582 	struct tcp_iter_state *st = seq->private;
2583 	void *rc = NULL;
2584 
2585 	if (v == SEQ_START_TOKEN) {
2586 		rc = tcp_get_idx(seq, 0);
2587 		goto out;
2588 	}
2589 
2590 	switch (st->state) {
2591 	case TCP_SEQ_STATE_LISTENING:
2592 		rc = listening_get_next(seq, v);
2593 		if (!rc) {
2594 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2595 			st->bucket = 0;
2596 			st->offset = 0;
2597 			rc	  = established_get_first(seq);
2598 		}
2599 		break;
2600 	case TCP_SEQ_STATE_ESTABLISHED:
2601 		rc = established_get_next(seq, v);
2602 		break;
2603 	}
2604 out:
2605 	++*pos;
2606 	st->last_pos = *pos;
2607 	return rc;
2608 }
2609 EXPORT_SYMBOL(tcp_seq_next);
2610 
2611 void tcp_seq_stop(struct seq_file *seq, void *v)
2612 {
2613 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2614 	struct tcp_iter_state *st = seq->private;
2615 
2616 	switch (st->state) {
2617 	case TCP_SEQ_STATE_LISTENING:
2618 		if (v != SEQ_START_TOKEN)
2619 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2620 		break;
2621 	case TCP_SEQ_STATE_ESTABLISHED:
2622 		if (v)
2623 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2624 		break;
2625 	}
2626 }
2627 EXPORT_SYMBOL(tcp_seq_stop);
2628 
2629 static void get_openreq4(const struct request_sock *req,
2630 			 struct seq_file *f, int i)
2631 {
2632 	const struct inet_request_sock *ireq = inet_rsk(req);
2633 	long delta = req->rsk_timer.expires - jiffies;
2634 
2635 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2636 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2637 		i,
2638 		ireq->ir_loc_addr,
2639 		ireq->ir_num,
2640 		ireq->ir_rmt_addr,
2641 		ntohs(ireq->ir_rmt_port),
2642 		TCP_SYN_RECV,
2643 		0, 0, /* could print option size, but that is af dependent. */
2644 		1,    /* timers active (only the expire timer) */
2645 		jiffies_delta_to_clock_t(delta),
2646 		req->num_timeout,
2647 		from_kuid_munged(seq_user_ns(f),
2648 				 sock_i_uid(req->rsk_listener)),
2649 		0,  /* non standard timer */
2650 		0, /* open_requests have no inode */
2651 		0,
2652 		req);
2653 }
2654 
2655 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2656 {
2657 	int timer_active;
2658 	unsigned long timer_expires;
2659 	const struct tcp_sock *tp = tcp_sk(sk);
2660 	const struct inet_connection_sock *icsk = inet_csk(sk);
2661 	const struct inet_sock *inet = inet_sk(sk);
2662 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2663 	__be32 dest = inet->inet_daddr;
2664 	__be32 src = inet->inet_rcv_saddr;
2665 	__u16 destp = ntohs(inet->inet_dport);
2666 	__u16 srcp = ntohs(inet->inet_sport);
2667 	int rx_queue;
2668 	int state;
2669 
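	/* timer_active encodes which timer is pending in the /proc/net/tcp
	 * dump: 1 = retransmit/loss probe, 2 = keepalive (sk_timer),
	 * 4 = zero window probe, 0 = no timer pending.
	 */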
2670 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2671 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2672 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2673 		timer_active	= 1;
2674 		timer_expires	= icsk->icsk_timeout;
2675 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2676 		timer_active	= 4;
2677 		timer_expires	= icsk->icsk_timeout;
2678 	} else if (timer_pending(&sk->sk_timer)) {
2679 		timer_active	= 2;
2680 		timer_expires	= sk->sk_timer.expires;
2681 	} else {
2682 		timer_active	= 0;
2683 		timer_expires = jiffies;
2684 	}
2685 
2686 	state = inet_sk_state_load(sk);
2687 	if (state == TCP_LISTEN)
2688 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2689 	else
2690 		/* Because we don't lock the socket,
2691 		 * we might find a transient negative value.
2692 		 */
2693 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2694 				      READ_ONCE(tp->copied_seq), 0);
2695 
2696 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2697 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2698 		i, src, srcp, dest, destp, state,
2699 		READ_ONCE(tp->write_seq) - tp->snd_una,
2700 		rx_queue,
2701 		timer_active,
2702 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2703 		icsk->icsk_retransmits,
2704 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2705 		icsk->icsk_probes_out,
2706 		sock_i_ino(sk),
2707 		refcount_read(&sk->sk_refcnt), sk,
2708 		jiffies_to_clock_t(icsk->icsk_rto),
2709 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2710 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2711 		tcp_snd_cwnd(tp),
2712 		state == TCP_LISTEN ?
2713 		    fastopenq->max_qlen :
2714 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2715 }
2716 
2717 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2718 			       struct seq_file *f, int i)
2719 {
2720 	long delta = tw->tw_timer.expires - jiffies;
2721 	__be32 dest, src;
2722 	__u16 destp, srcp;
2723 
2724 	dest  = tw->tw_daddr;
2725 	src   = tw->tw_rcv_saddr;
2726 	destp = ntohs(tw->tw_dport);
2727 	srcp  = ntohs(tw->tw_sport);
2728 
2729 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2730 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2731 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2732 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2733 		refcount_read(&tw->tw_refcnt), tw);
2734 }
2735 
2736 #define TMPSZ 150
2737 
2738 static int tcp4_seq_show(struct seq_file *seq, void *v)
2739 {
2740 	struct tcp_iter_state *st;
2741 	struct sock *sk = v;
2742 
2743 	seq_setwidth(seq, TMPSZ - 1);
2744 	if (v == SEQ_START_TOKEN) {
2745 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2746 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2747 			   "inode");
2748 		goto out;
2749 	}
2750 	st = seq->private;
2751 
2752 	if (sk->sk_state == TCP_TIME_WAIT)
2753 		get_timewait4_sock(v, seq, st->num);
2754 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2755 		get_openreq4(v, seq, st->num);
2756 	else
2757 		get_tcp4_sock(v, seq, st->num);
2758 out:
2759 	seq_pad(seq, '\n');
2760 	return 0;
2761 }
2762 
2763 #ifdef CONFIG_BPF_SYSCALL
2764 struct bpf_tcp_iter_state {
2765 	struct tcp_iter_state state;
2766 	unsigned int cur_sk;
2767 	unsigned int end_sk;
2768 	unsigned int max_sk;
2769 	struct sock **batch;
2770 	bool st_bucket_done;
2771 };
2772 
2773 struct bpf_iter__tcp {
2774 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2775 	__bpf_md_ptr(struct sock_common *, sk_common);
2776 	uid_t uid __aligned(8);
2777 };
2778 
2779 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2780 			     struct sock_common *sk_common, uid_t uid)
2781 {
2782 	struct bpf_iter__tcp ctx;
2783 
2784 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2785 	ctx.meta = meta;
2786 	ctx.sk_common = sk_common;
2787 	ctx.uid = uid;
2788 	return bpf_iter_run_prog(prog, &ctx);
2789 }
2790 
2791 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2792 {
2793 	while (iter->cur_sk < iter->end_sk)
2794 		sock_gen_put(iter->batch[iter->cur_sk++]);
2795 }
2796 
2797 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2798 				      unsigned int new_batch_sz)
2799 {
2800 	struct sock **new_batch;
2801 
2802 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2803 			     GFP_USER | __GFP_NOWARN);
2804 	if (!new_batch)
2805 		return -ENOMEM;
2806 
2807 	bpf_iter_tcp_put_batch(iter);
2808 	kvfree(iter->batch);
2809 	iter->batch = new_batch;
2810 	iter->max_sk = new_batch_sz;
2811 
2812 	return 0;
2813 }
2814 
2815 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2816 						 struct sock *start_sk)
2817 {
2818 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2819 	struct bpf_tcp_iter_state *iter = seq->private;
2820 	struct tcp_iter_state *st = &iter->state;
2821 	struct hlist_nulls_node *node;
2822 	unsigned int expected = 1;
2823 	struct sock *sk;
2824 
2825 	sock_hold(start_sk);
2826 	iter->batch[iter->end_sk++] = start_sk;
2827 
2828 	sk = sk_nulls_next(start_sk);
2829 	sk_nulls_for_each_from(sk, node) {
2830 		if (seq_sk_match(seq, sk)) {
2831 			if (iter->end_sk < iter->max_sk) {
2832 				sock_hold(sk);
2833 				iter->batch[iter->end_sk++] = sk;
2834 			}
2835 			expected++;
2836 		}
2837 	}
2838 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
2839 
2840 	return expected;
2841 }
2842 
2843 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2844 						   struct sock *start_sk)
2845 {
2846 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2847 	struct bpf_tcp_iter_state *iter = seq->private;
2848 	struct tcp_iter_state *st = &iter->state;
2849 	struct hlist_nulls_node *node;
2850 	unsigned int expected = 1;
2851 	struct sock *sk;
2852 
2853 	sock_hold(start_sk);
2854 	iter->batch[iter->end_sk++] = start_sk;
2855 
2856 	sk = sk_nulls_next(start_sk);
2857 	sk_nulls_for_each_from(sk, node) {
2858 		if (seq_sk_match(seq, sk)) {
2859 			if (iter->end_sk < iter->max_sk) {
2860 				sock_hold(sk);
2861 				iter->batch[iter->end_sk++] = sk;
2862 			}
2863 			expected++;
2864 		}
2865 	}
2866 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2867 
2868 	return expected;
2869 }
2870 
2871 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2872 {
2873 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2874 	struct bpf_tcp_iter_state *iter = seq->private;
2875 	struct tcp_iter_state *st = &iter->state;
2876 	unsigned int expected;
2877 	bool resized = false;
2878 	struct sock *sk;
2879 
2880 	/* The st->bucket is done.  Directly advance to the next
2881 	 * bucket instead of having tcp_seek_last_pos() skip sockets
2882 	 * one by one in the current bucket, only to eventually find out
2883 	 * it has to advance to the next bucket anyway.
2884 	 */
2885 	if (iter->st_bucket_done) {
2886 		st->offset = 0;
2887 		st->bucket++;
2888 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2889 		    st->bucket > hinfo->lhash2_mask) {
2890 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2891 			st->bucket = 0;
2892 		}
2893 	}
2894 
2895 again:
2896 	/* Get a new batch */
2897 	iter->cur_sk = 0;
2898 	iter->end_sk = 0;
2899 	iter->st_bucket_done = false;
2900 
2901 	sk = tcp_seek_last_pos(seq);
2902 	if (!sk)
2903 		return NULL; /* Done */
2904 
2905 	if (st->state == TCP_SEQ_STATE_LISTENING)
2906 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2907 	else
2908 		expected = bpf_iter_tcp_established_batch(seq, sk);
2909 
2910 	if (iter->end_sk == expected) {
2911 		iter->st_bucket_done = true;
2912 		return sk;
2913 	}
2914 
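	/* The batch was too small for this bucket: grow it to 1.5x the
	 * number of sockets seen and retry the same bucket once
	 * (the "resized" flag prevents endless retries).
	 */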
2915 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2916 		resized = true;
2917 		goto again;
2918 	}
2919 
2920 	return sk;
2921 }
2922 
2923 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2924 {
2925 	/* bpf iter does not support lseek, so it always
2926 	 * continues from where it was stop()-ped.
2927 	 */
2928 	if (*pos)
2929 		return bpf_iter_tcp_batch(seq);
2930 
2931 	return SEQ_START_TOKEN;
2932 }
2933 
2934 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2935 {
2936 	struct bpf_tcp_iter_state *iter = seq->private;
2937 	struct tcp_iter_state *st = &iter->state;
2938 	struct sock *sk;
2939 
2940 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
2941 	 * already been through seq_show(), so advance to the next sk in
2942 	 * the batch.
2943 	 */
2944 	if (iter->cur_sk < iter->end_sk) {
2945 		/* Keeping st->num consistent in tcp_iter_state.
2946 		 * bpf_iter_tcp does not use st->num.
2947 		 * meta.seq_num is used instead.
2948 		 */
2949 		st->num++;
2950 		/* Move st->offset to the next sk in the bucket such that
2951 		 * the future start() will resume at st->offset in
2952 		 * st->bucket.  See tcp_seek_last_pos().
2953 		 */
2954 		st->offset++;
2955 		sock_gen_put(iter->batch[iter->cur_sk++]);
2956 	}
2957 
2958 	if (iter->cur_sk < iter->end_sk)
2959 		sk = iter->batch[iter->cur_sk];
2960 	else
2961 		sk = bpf_iter_tcp_batch(seq);
2962 
2963 	++*pos;
2964 	/* Keeping st->last_pos consistent in tcp_iter_state.
2965 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2966 	 */
2967 	st->last_pos = *pos;
2968 	return sk;
2969 }
2970 
2971 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2972 {
2973 	struct bpf_iter_meta meta;
2974 	struct bpf_prog *prog;
2975 	struct sock *sk = v;
2976 	uid_t uid;
2977 	int ret;
2978 
2979 	if (v == SEQ_START_TOKEN)
2980 		return 0;
2981 
2982 	if (sk_fullsock(sk))
2983 		lock_sock(sk);
2984 
2985 	if (unlikely(sk_unhashed(sk))) {
2986 		ret = SEQ_SKIP;
2987 		goto unlock;
2988 	}
2989 
2990 	if (sk->sk_state == TCP_TIME_WAIT) {
2991 		uid = 0;
2992 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2993 		const struct request_sock *req = v;
2994 
2995 		uid = from_kuid_munged(seq_user_ns(seq),
2996 				       sock_i_uid(req->rsk_listener));
2997 	} else {
2998 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2999 	}
3000 
3001 	meta.seq = seq;
3002 	prog = bpf_iter_get_info(&meta, false);
3003 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3004 
3005 unlock:
3006 	if (sk_fullsock(sk))
3007 		release_sock(sk);
3008 	return ret;
3009 
3010 }
3011 
3012 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3013 {
3014 	struct bpf_tcp_iter_state *iter = seq->private;
3015 	struct bpf_iter_meta meta;
3016 	struct bpf_prog *prog;
3017 
3018 	if (!v) {
3019 		meta.seq = seq;
3020 		prog = bpf_iter_get_info(&meta, true);
3021 		if (prog)
3022 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3023 	}
3024 
3025 	if (iter->cur_sk < iter->end_sk) {
3026 		bpf_iter_tcp_put_batch(iter);
3027 		iter->st_bucket_done = false;
3028 	}
3029 }
3030 
3031 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3032 	.show		= bpf_iter_tcp_seq_show,
3033 	.start		= bpf_iter_tcp_seq_start,
3034 	.next		= bpf_iter_tcp_seq_next,
3035 	.stop		= bpf_iter_tcp_seq_stop,
3036 };
3037 #endif
3038 static unsigned short seq_file_family(const struct seq_file *seq)
3039 {
3040 	const struct tcp_seq_afinfo *afinfo;
3041 
3042 #ifdef CONFIG_BPF_SYSCALL
3043 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3044 	if (seq->op == &bpf_iter_tcp_seq_ops)
3045 		return AF_UNSPEC;
3046 #endif
3047 
3048 	/* Iterated from proc fs */
3049 	afinfo = pde_data(file_inode(seq->file));
3050 	return afinfo->family;
3051 }
3052 
3053 static const struct seq_operations tcp4_seq_ops = {
3054 	.show		= tcp4_seq_show,
3055 	.start		= tcp_seq_start,
3056 	.next		= tcp_seq_next,
3057 	.stop		= tcp_seq_stop,
3058 };
3059 
3060 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3061 	.family		= AF_INET,
3062 };
3063 
3064 static int __net_init tcp4_proc_init_net(struct net *net)
3065 {
3066 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3067 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3068 		return -ENOMEM;
3069 	return 0;
3070 }
3071 
3072 static void __net_exit tcp4_proc_exit_net(struct net *net)
3073 {
3074 	remove_proc_entry("tcp", net->proc_net);
3075 }
3076 
3077 static struct pernet_operations tcp4_net_ops = {
3078 	.init = tcp4_proc_init_net,
3079 	.exit = tcp4_proc_exit_net,
3080 };
3081 
3082 int __init tcp4_proc_init(void)
3083 {
3084 	return register_pernet_subsys(&tcp4_net_ops);
3085 }
3086 
3087 void tcp4_proc_exit(void)
3088 {
3089 	unregister_pernet_subsys(&tcp4_net_ops);
3090 }
3091 #endif /* CONFIG_PROC_FS */
3092 
3093 /* @wake is one when sk_stream_write_space() calls us.
3094  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3095  * This mimics the strategy used in sock_def_write_space().
3096  */
3097 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3098 {
3099 	const struct tcp_sock *tp = tcp_sk(sk);
3100 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3101 			    READ_ONCE(tp->snd_nxt);
3102 
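	/* With @wake == 1 the byte count is doubled, so EPOLLOUT is only
	 * signalled once the unsent data drops below half of
	 * tcp_notsent_lowat().
	 */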
3103 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3104 }
3105 EXPORT_SYMBOL(tcp_stream_memory_free);
3106 
3107 struct proto tcp_prot = {
3108 	.name			= "TCP",
3109 	.owner			= THIS_MODULE,
3110 	.close			= tcp_close,
3111 	.pre_connect		= tcp_v4_pre_connect,
3112 	.connect		= tcp_v4_connect,
3113 	.disconnect		= tcp_disconnect,
3114 	.accept			= inet_csk_accept,
3115 	.ioctl			= tcp_ioctl,
3116 	.init			= tcp_v4_init_sock,
3117 	.destroy		= tcp_v4_destroy_sock,
3118 	.shutdown		= tcp_shutdown,
3119 	.setsockopt		= tcp_setsockopt,
3120 	.getsockopt		= tcp_getsockopt,
3121 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3122 	.keepalive		= tcp_set_keepalive,
3123 	.recvmsg		= tcp_recvmsg,
3124 	.sendmsg		= tcp_sendmsg,
3125 	.splice_eof		= tcp_splice_eof,
3126 	.backlog_rcv		= tcp_v4_do_rcv,
3127 	.release_cb		= tcp_release_cb,
3128 	.hash			= inet_hash,
3129 	.unhash			= inet_unhash,
3130 	.get_port		= inet_csk_get_port,
3131 	.put_port		= inet_put_port,
3132 #ifdef CONFIG_BPF_SYSCALL
3133 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3134 #endif
3135 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3136 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3137 	.stream_memory_free	= tcp_stream_memory_free,
3138 	.sockets_allocated	= &tcp_sockets_allocated,
3139 	.orphan_count		= &tcp_orphan_count,
3140 
3141 	.memory_allocated	= &tcp_memory_allocated,
3142 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3143 
3144 	.memory_pressure	= &tcp_memory_pressure,
3145 	.sysctl_mem		= sysctl_tcp_mem,
3146 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3147 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3148 	.max_header		= MAX_TCP_HEADER,
3149 	.obj_size		= sizeof(struct tcp_sock),
3150 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3151 	.twsk_prot		= &tcp_timewait_sock_ops,
3152 	.rsk_prot		= &tcp_request_sock_ops,
3153 	.h.hashinfo		= NULL,
3154 	.no_autobind		= true,
3155 	.diag_destroy		= tcp_abort,
3156 };
3157 EXPORT_SYMBOL(tcp_prot);
3158 
3159 static void __net_exit tcp_sk_exit(struct net *net)
3160 {
3161 	if (net->ipv4.tcp_congestion_control)
3162 		bpf_module_put(net->ipv4.tcp_congestion_control,
3163 			       net->ipv4.tcp_congestion_control->owner);
3164 }
3165 
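/* A child netns may get its own ehash, sized by the creating netns'
 * sysctl_tcp_child_ehash_entries (rounded up to a power of two);
 * otherwise it falls back to sharing the global tcp_hashinfo.
 */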
3166 static void __net_init tcp_set_hashinfo(struct net *net)
3167 {
3168 	struct inet_hashinfo *hinfo;
3169 	unsigned int ehash_entries;
3170 	struct net *old_net;
3171 
3172 	if (net_eq(net, &init_net))
3173 		goto fallback;
3174 
3175 	old_net = current->nsproxy->net_ns;
3176 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3177 	if (!ehash_entries)
3178 		goto fallback;
3179 
3180 	ehash_entries = roundup_pow_of_two(ehash_entries);
3181 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3182 	if (!hinfo) {
3183 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3184 			"for a netns, fallback to the global one\n",
3185 			ehash_entries);
3186 fallback:
3187 		hinfo = &tcp_hashinfo;
3188 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3189 	}
3190 
3191 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3192 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3193 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3194 }
3195 
3196 static int __net_init tcp_sk_init(struct net *net)
3197 {
3198 	net->ipv4.sysctl_tcp_ecn = 2;
3199 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3200 
3201 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3202 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3203 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3204 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3205 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3206 
3207 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3208 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3209 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3210 
3211 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3212 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3213 	net->ipv4.sysctl_tcp_syncookies = 1;
3214 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3215 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3216 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3217 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3218 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3219 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3220 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3221 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3222 
3223 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3224 	tcp_set_hashinfo(net);
3225 
3226 	net->ipv4.sysctl_tcp_sack = 1;
3227 	net->ipv4.sysctl_tcp_window_scaling = 1;
3228 	net->ipv4.sysctl_tcp_timestamps = 1;
3229 	net->ipv4.sysctl_tcp_early_retrans = 3;
3230 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3231 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3232 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3233 	net->ipv4.sysctl_tcp_max_reordering = 300;
3234 	net->ipv4.sysctl_tcp_dsack = 1;
3235 	net->ipv4.sysctl_tcp_app_win = 31;
3236 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3237 	net->ipv4.sysctl_tcp_frto = 2;
3238 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3239 	/* This limits the percentage of the congestion window which we
3240 	 * will allow a single TSO frame to consume.  Building TSO frames
3241 	 * which are too large can cause TCP streams to be bursty.
3242 	 */
3243 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3244 	/* Default TSQ limit of 16 TSO segments */
3245 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3246 
3247 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3248 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3249 
3250 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3251 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3252 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3253 	net->ipv4.sysctl_tcp_autocorking = 1;
3254 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3255 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3256 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3257 	if (net != &init_net) {
3258 		memcpy(net->ipv4.sysctl_tcp_rmem,
3259 		       init_net.ipv4.sysctl_tcp_rmem,
3260 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3261 		memcpy(net->ipv4.sysctl_tcp_wmem,
3262 		       init_net.ipv4.sysctl_tcp_wmem,
3263 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3264 	}
3265 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3266 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3267 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3268 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3269 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3270 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3271 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3272 
3273 	/* Set default values for PLB */
3274 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3275 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3276 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3277 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3278 	/* Default congestion threshold for PLB to mark a round is 50% */
3279 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3280 
3281 	/* Reno is always built in */
3282 	if (!net_eq(net, &init_net) &&
3283 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3284 			       init_net.ipv4.tcp_congestion_control->owner))
3285 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3286 	else
3287 		net->ipv4.tcp_congestion_control = &tcp_reno;
3288 
3289 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3290 	net->ipv4.sysctl_tcp_shrink_window = 0;
3291 
3292 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3293 
3294 	return 0;
3295 }
3296 
3297 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3298 {
3299 	struct net *net;
3300 
3301 	tcp_twsk_purge(net_exit_list, AF_INET);
3302 
3303 	list_for_each_entry(net, net_exit_list, exit_list) {
3304 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3305 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3306 		tcp_fastopen_ctx_destroy(net);
3307 	}
3308 }
3309 
3310 static struct pernet_operations __net_initdata tcp_sk_ops = {
3311        .init	   = tcp_sk_init,
3312        .exit	   = tcp_sk_exit,
3313        .exit_batch = tcp_sk_exit_batch,
3314 };
3315 
3316 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3317 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3318 		     struct sock_common *sk_common, uid_t uid)
3319 
3320 #define INIT_BATCH_SZ 16
3321 
3322 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3323 {
3324 	struct bpf_tcp_iter_state *iter = priv_data;
3325 	int err;
3326 
3327 	err = bpf_iter_init_seq_net(priv_data, aux);
3328 	if (err)
3329 		return err;
3330 
3331 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3332 	if (err) {
3333 		bpf_iter_fini_seq_net(priv_data);
3334 		return err;
3335 	}
3336 
3337 	return 0;
3338 }
3339 
3340 static void bpf_iter_fini_tcp(void *priv_data)
3341 {
3342 	struct bpf_tcp_iter_state *iter = priv_data;
3343 
3344 	bpf_iter_fini_seq_net(priv_data);
3345 	kvfree(iter->batch);
3346 }
3347 
3348 static const struct bpf_iter_seq_info tcp_seq_info = {
3349 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3350 	.init_seq_private	= bpf_iter_init_tcp,
3351 	.fini_seq_private	= bpf_iter_fini_tcp,
3352 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3353 };
3354 
3355 static const struct bpf_func_proto *
3356 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3357 			    const struct bpf_prog *prog)
3358 {
3359 	switch (func_id) {
3360 	case BPF_FUNC_setsockopt:
3361 		return &bpf_sk_setsockopt_proto;
3362 	case BPF_FUNC_getsockopt:
3363 		return &bpf_sk_getsockopt_proto;
3364 	default:
3365 		return NULL;
3366 	}
3367 }
3368 
3369 static struct bpf_iter_reg tcp_reg_info = {
3370 	.target			= "tcp",
3371 	.ctx_arg_info_size	= 1,
3372 	.ctx_arg_info		= {
3373 		{ offsetof(struct bpf_iter__tcp, sk_common),
3374 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3375 	},
3376 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3377 	.seq_info		= &tcp_seq_info,
3378 };
3379 
3380 static void __init bpf_iter_register(void)
3381 {
3382 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3383 	if (bpf_iter_reg_target(&tcp_reg_info))
3384 		pr_warn("Warning: could not register bpf iterator tcp\n");
3385 }
3386 
3387 #endif
3388 
3389 void __init tcp_v4_init(void)
3390 {
3391 	int cpu, res;
3392 
3393 	for_each_possible_cpu(cpu) {
3394 		struct sock *sk;
3395 
3396 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3397 					   IPPROTO_TCP, &init_net);
3398 		if (res)
3399 			panic("Failed to create the TCP control socket.\n");
3400 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3401 
3402 		/* Please enforce IP_DF and IPID==0 for RST and
3403 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3404 		 */
3405 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3406 
3407 		per_cpu(ipv4_tcp_sk, cpu) = sk;
3408 	}
3409 	if (register_pernet_subsys(&tcp_sk_ops))
3410 		panic("Failed to create the TCP control socket.\n");
3411 
3412 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3413 	bpf_iter_register();
3414 #endif
3415 }
3416