xref: /linux/net/ipv4/tcp_ipv4.c (revision 27eddbf3449026a73d6ed52d55b192bfcf526a03)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74 
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 #include <linux/skbuff_ref.h>
83 
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86 
87 #include <trace/events/tcp.h>
88 
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93 
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96 
97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
99 };
100 
101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
102 
103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104 {
105 	return secure_tcp_seq(ip_hdr(skb)->daddr,
106 			      ip_hdr(skb)->saddr,
107 			      tcp_hdr(skb)->dest,
108 			      tcp_hdr(skb)->source);
109 }
110 
111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112 {
113 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114 }
115 
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
120 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 	struct tcp_sock *tp = tcp_sk(sk);
122 	int ts_recent_stamp;
123 	u32 reuse_thresh;
124 
125 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
126 		reuse = 0;
127 
128 	if (reuse == 2) {
129 		/* Still does not detect *everything* that goes through
130 		 * lo, since we require a loopback src or dst address
131 		 * or direct binding to 'lo' interface.
132 		 */
133 		bool loopback = false;
134 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
135 			loopback = true;
136 #if IS_ENABLED(CONFIG_IPV6)
137 		if (tw->tw_family == AF_INET6) {
138 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
139 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
140 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
142 				loopback = true;
143 		} else
144 #endif
145 		{
146 			if (ipv4_is_loopback(tw->tw_daddr) ||
147 			    ipv4_is_loopback(tw->tw_rcv_saddr))
148 				loopback = true;
149 		}
150 		if (!loopback)
151 			reuse = 0;
152 	}
153 
154 	/* With PAWS, it is safe from the viewpoint
155 	   of data integrity. Even without PAWS it is safe provided sequence
156 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
157 
158 	   Actually, the idea is close to VJ's, only the timestamp cache is
159 	   held not per host but per port pair, and the TW bucket is used as
160 	   the state holder.
161 
162 	   If TW bucket has been already destroyed we fall back to VJ's scheme
163 	   and use initial timestamp retrieved from peer table.
164 	 */
165 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
166 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
167 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
168 	if (ts_recent_stamp &&
169 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
170 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
171 		 * and releasing the bucket lock.
172 		 */
173 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
174 			return 0;
175 
176 		/* In case of repair and re-using TIME-WAIT sockets we still
177 		 * want to be sure that it is safe as above but honor the
178 		 * sequence numbers and time stamps set as part of the repair
179 		 * process.
180 		 *
181 		 * Without this check re-using a TIME-WAIT socket with TCP
182 		 * repair would accumulate a -1 on the repair assigned
183 		 * sequence number. The first time it is reused the sequence
184 		 * is -1, the second time -2, etc. This fixes that issue
185 		 * without appearing to create any others.
186 		 */
187 		if (likely(!tp->repair)) {
188 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
189 
190 			if (!seq)
191 				seq = 1;
192 			WRITE_ONCE(tp->write_seq, seq);
193 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
194 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
195 		}
196 
197 		return 1;
198 	}
199 
200 	return 0;
201 }
202 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
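
/* Editorial note (not part of the kernel source): tcp_twsk_unique() above is
 * gated by the net.ipv4.tcp_tw_reuse sysctl (0 = off, 1 = on, 2 = loopback
 * connections only, matching the reuse == 2 branch) and rate-limited by
 * net.ipv4.tcp_tw_reuse_delay. A minimal userspace sketch enabling the
 * loopback-only mode, assuming the usual procfs path:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");
 *
 *		if (!f)
 *			return 1;
 *		fputs("2", f);	// 2: reuse TIME-WAIT sockets for loopback only
 *		return fclose(f) ? 1 : 0;
 *	}
 */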
203 
204 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
205 			      int addr_len)
206 {
207 	/* This check is replicated from tcp_v4_connect() and intended to
208 	 * prevent BPF program called below from accessing bytes that are out
209 	 * of the bound specified by user in addr_len.
210 	 */
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	sock_owned_by_me(sk);
215 
216 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
217 }
218 
219 /* This will initiate an outgoing connection. */
220 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
221 {
222 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
223 	struct inet_timewait_death_row *tcp_death_row;
224 	struct inet_sock *inet = inet_sk(sk);
225 	struct tcp_sock *tp = tcp_sk(sk);
226 	struct ip_options_rcu *inet_opt;
227 	struct net *net = sock_net(sk);
228 	__be16 orig_sport, orig_dport;
229 	__be32 daddr, nexthop;
230 	struct flowi4 *fl4;
231 	struct rtable *rt;
232 	int err;
233 
234 	if (addr_len < sizeof(struct sockaddr_in))
235 		return -EINVAL;
236 
237 	if (usin->sin_family != AF_INET)
238 		return -EAFNOSUPPORT;
239 
240 	nexthop = daddr = usin->sin_addr.s_addr;
241 	inet_opt = rcu_dereference_protected(inet->inet_opt,
242 					     lockdep_sock_is_held(sk));
243 	if (inet_opt && inet_opt->opt.srr) {
244 		if (!daddr)
245 			return -EINVAL;
246 		nexthop = inet_opt->opt.faddr;
247 	}
248 
249 	orig_sport = inet->inet_sport;
250 	orig_dport = usin->sin_port;
251 	fl4 = &inet->cork.fl.u.ip4;
252 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
253 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
254 			      orig_dport, sk);
255 	if (IS_ERR(rt)) {
256 		err = PTR_ERR(rt);
257 		if (err == -ENETUNREACH)
258 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
259 		return err;
260 	}
261 
262 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
263 		ip_rt_put(rt);
264 		return -ENETUNREACH;
265 	}
266 
267 	if (!inet_opt || !inet_opt->opt.srr)
268 		daddr = fl4->daddr;
269 
270 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
271 
272 	if (!inet->inet_saddr) {
273 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
274 		if (err) {
275 			ip_rt_put(rt);
276 			return err;
277 		}
278 	} else {
279 		sk_rcv_saddr_set(sk, inet->inet_saddr);
280 	}
281 
282 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
283 		/* Reset inherited state */
284 		tp->rx_opt.ts_recent	   = 0;
285 		tp->rx_opt.ts_recent_stamp = 0;
286 		if (likely(!tp->repair))
287 			WRITE_ONCE(tp->write_seq, 0);
288 	}
289 
290 	inet->inet_dport = usin->sin_port;
291 	sk_daddr_set(sk, daddr);
292 
293 	inet_csk(sk)->icsk_ext_hdr_len = 0;
294 	if (inet_opt)
295 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
296 
297 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
298 
299 	/* Socket identity is still unknown (sport may be zero).
300 	 * However, we set the state to SYN-SENT and, without releasing the
301 	 * socket lock, select a source port, enter ourselves into the hash
302 	 * tables and complete initialization after this.
303 	 */
304 	tcp_set_state(sk, TCP_SYN_SENT);
305 	err = inet_hash_connect(tcp_death_row, sk);
306 	if (err)
307 		goto failure;
308 
309 	sk_set_txhash(sk);
310 
311 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
312 			       inet->inet_sport, inet->inet_dport, sk);
313 	if (IS_ERR(rt)) {
314 		err = PTR_ERR(rt);
315 		rt = NULL;
316 		goto failure;
317 	}
318 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
319 	/* OK, now commit destination to socket.  */
320 	sk->sk_gso_type = SKB_GSO_TCPV4;
321 	sk_setup_caps(sk, &rt->dst);
322 	rt = NULL;
323 
324 	if (likely(!tp->repair)) {
325 		if (!tp->write_seq)
326 			WRITE_ONCE(tp->write_seq,
327 				   secure_tcp_seq(inet->inet_saddr,
328 						  inet->inet_daddr,
329 						  inet->inet_sport,
330 						  usin->sin_port));
331 		WRITE_ONCE(tp->tsoffset,
332 			   secure_tcp_ts_off(net, inet->inet_saddr,
333 					     inet->inet_daddr));
334 	}
335 
336 	atomic_set(&inet->inet_id, get_random_u16());
337 
338 	if (tcp_fastopen_defer_connect(sk, &err))
339 		return err;
340 	if (err)
341 		goto failure;
342 
343 	err = tcp_connect(sk);
344 
345 	if (err)
346 		goto failure;
347 
348 	return 0;
349 
350 failure:
351 	/*
352 	 * This unhashes the socket and releases the local port,
353 	 * if necessary.
354 	 */
355 	tcp_set_state(sk, TCP_CLOSE);
356 	inet_bhash2_reset_saddr(sk);
357 	ip_rt_put(rt);
358 	sk->sk_route_caps = 0;
359 	inet->inet_dport = 0;
360 	return err;
361 }
362 EXPORT_SYMBOL(tcp_v4_connect);
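
/* Editorial illustration (not part of the kernel source): tcp_v4_connect()
 * above is what services a userspace connect() on an AF_INET stream socket;
 * -EAFNOSUPPORT, -EINVAL and -ENETUNREACH surface from the checks at the top.
 * A minimal sketch, with the peer address purely illustrative:
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int tcp_connect_example(void)
 *	{
 *		struct sockaddr_in dst = { .sin_family = AF_INET,
 *					   .sin_port = htons(80) };
 *		int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *		if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
 *			close(fd);	// errno reflects the failure paths above
 *			return -1;
 *		}
 *		return fd;
 *	}
 */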
363 
364 /*
365  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
366  * It can be called through tcp_release_cb() if socket was owned by user
367  * at the time tcp_v4_err() was called to handle ICMP message.
368  */
369 void tcp_v4_mtu_reduced(struct sock *sk)
370 {
371 	struct inet_sock *inet = inet_sk(sk);
372 	struct dst_entry *dst;
373 	u32 mtu;
374 
375 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
376 		return;
377 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
378 	dst = inet_csk_update_pmtu(sk, mtu);
379 	if (!dst)
380 		return;
381 
382 	/* Something is about to go wrong... Remember the soft error
383 	 * in case this connection is not able to recover.
384 	 */
385 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
386 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
387 
388 	mtu = dst_mtu(dst);
389 
390 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
391 	    ip_sk_accept_pmtu(sk) &&
392 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
393 		tcp_sync_mss(sk, mtu);
394 
395 		/* Resend the TCP packet because it's
396 		 * clear that the old packet has been
397 		 * dropped. This is the new "fast" path mtu
398 		 * discovery.
399 		 */
400 		tcp_simple_retransmit(sk);
401 	} /* else let the usual retransmit timer handle it */
402 }
403 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
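
/* Editorial note (not part of the kernel source): the MSS resync above is
 * skipped when a socket has disabled Path MTU Discovery (pmtudisc ==
 * IP_PMTUDISC_DONT). A hedged userspace sketch toggling that per-socket
 * policy through the standard IP_MTU_DISCOVER option:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static int disable_pmtu_discovery(int fd)
 *	{
 *		int val = IP_PMTUDISC_DONT;
 *
 *		return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER,
 *				  &val, sizeof(val));
 *	}
 */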
404 
405 static void do_redirect(struct sk_buff *skb, struct sock *sk)
406 {
407 	struct dst_entry *dst = __sk_dst_check(sk, 0);
408 
409 	if (dst)
410 		dst->ops->redirect(dst, sk, skb);
411 }
412 
413 
414 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
415 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
416 {
417 	struct request_sock *req = inet_reqsk(sk);
418 	struct net *net = sock_net(sk);
419 
420 	/* ICMPs are not backlogged, hence we cannot get
421 	 * an established socket here.
422 	 */
423 	if (seq != tcp_rsk(req)->snt_isn) {
424 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
425 	} else if (abort) {
426 		/*
427 		 * Still in SYN_RECV, just remove it silently.
428 		 * There is no good way to pass the error to the newly
429 		 * created socket, and POSIX does not want network
430 		 * errors returned from accept().
431 		 */
432 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
433 		tcp_listendrop(req->rsk_listener);
434 	}
435 	reqsk_put(req);
436 }
437 EXPORT_SYMBOL(tcp_req_err);
438 
439 /* TCP-LD (RFC 6069) logic */
440 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
441 {
442 	struct inet_connection_sock *icsk = inet_csk(sk);
443 	struct tcp_sock *tp = tcp_sk(sk);
444 	struct sk_buff *skb;
445 	s32 remaining;
446 	u32 delta_us;
447 
448 	if (sock_owned_by_user(sk))
449 		return;
450 
451 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
452 	    !icsk->icsk_backoff)
453 		return;
454 
455 	skb = tcp_rtx_queue_head(sk);
456 	if (WARN_ON_ONCE(!skb))
457 		return;
458 
459 	icsk->icsk_backoff--;
460 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
461 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
462 
463 	tcp_mstamp_refresh(tp);
464 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
465 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
466 
467 	if (remaining > 0) {
468 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
469 					  remaining, TCP_RTO_MAX);
470 	} else {
471 		/* RTO revert clocked out retransmission.
472 		 * Will retransmit now.
473 		 */
474 		tcp_retransmit_timer(sk);
475 	}
476 }
477 EXPORT_SYMBOL(tcp_ld_RTO_revert);
478 
479 /*
480  * This routine is called by the ICMP module when it gets some
481  * sort of error condition.  If err < 0 then the socket should
482  * be closed and the error returned to the user.  If err > 0
483  * it's just the icmp type << 8 | icmp code.  After adjustment
484  * header points to the first 8 bytes of the tcp header.  We need
485  * to find the appropriate port.
486  *
487  * The locking strategy used here is very "optimistic". When
488  * someone else accesses the socket the ICMP is just dropped
489  * and for some paths there is no check at all.
490  * A more general error queue to queue errors for later handling
491  * is probably better.
492  *
493  */
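
/* Editorial note (not part of the kernel source): a minimal sketch of the
 * "icmp type << 8 | icmp code" packing described above, using the standard
 * <linux/icmp.h> constants:
 *
 *	int err  = (ICMP_DEST_UNREACH << 8) | ICMP_PORT_UNREACH;
 *	int type = err >> 8;	// ICMP_DEST_UNREACH (3)
 *	int code = err & 0xff;	// ICMP_PORT_UNREACH (3)
 */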
494 
495 int tcp_v4_err(struct sk_buff *skb, u32 info)
496 {
497 	const struct iphdr *iph = (const struct iphdr *)skb->data;
498 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
499 	struct tcp_sock *tp;
500 	const int type = icmp_hdr(skb)->type;
501 	const int code = icmp_hdr(skb)->code;
502 	struct sock *sk;
503 	struct request_sock *fastopen;
504 	u32 seq, snd_una;
505 	int err;
506 	struct net *net = dev_net(skb->dev);
507 
508 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
509 				       iph->daddr, th->dest, iph->saddr,
510 				       ntohs(th->source), inet_iif(skb), 0);
511 	if (!sk) {
512 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
513 		return -ENOENT;
514 	}
515 	if (sk->sk_state == TCP_TIME_WAIT) {
516 		/* To increase the counter of ignored icmps for TCP-AO */
517 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
518 		inet_twsk_put(inet_twsk(sk));
519 		return 0;
520 	}
521 	seq = ntohl(th->seq);
522 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
523 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
524 				     type == ICMP_TIME_EXCEEDED ||
525 				     (type == ICMP_DEST_UNREACH &&
526 				      (code == ICMP_NET_UNREACH ||
527 				       code == ICMP_HOST_UNREACH)));
528 		return 0;
529 	}
530 
531 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
532 		sock_put(sk);
533 		return 0;
534 	}
535 
536 	bh_lock_sock(sk);
537 	/* If too many ICMPs get dropped on busy
538 	 * servers this needs to be solved differently.
539 	 * We do take care of the PMTU discovery (RFC1191) special case:
540 	 * we can receive locally generated ICMP messages while socket is held.
541 	 */
542 	if (sock_owned_by_user(sk)) {
543 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
544 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
545 	}
546 	if (sk->sk_state == TCP_CLOSE)
547 		goto out;
548 
549 	if (static_branch_unlikely(&ip4_min_ttl)) {
550 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
551 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
552 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
553 			goto out;
554 		}
555 	}
556 
557 	tp = tcp_sk(sk);
558 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
559 	fastopen = rcu_dereference(tp->fastopen_rsk);
560 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
561 	if (sk->sk_state != TCP_LISTEN &&
562 	    !between(seq, snd_una, tp->snd_nxt)) {
563 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
564 		goto out;
565 	}
566 
567 	switch (type) {
568 	case ICMP_REDIRECT:
569 		if (!sock_owned_by_user(sk))
570 			do_redirect(skb, sk);
571 		goto out;
572 	case ICMP_SOURCE_QUENCH:
573 		/* Just silently ignore these. */
574 		goto out;
575 	case ICMP_PARAMETERPROB:
576 		err = EPROTO;
577 		break;
578 	case ICMP_DEST_UNREACH:
579 		if (code > NR_ICMP_UNREACH)
580 			goto out;
581 
582 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
583 			/* We are not interested in TCP_LISTEN and open_requests
584 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
585 			 * they should go through unfragmented).
586 			 */
587 			if (sk->sk_state == TCP_LISTEN)
588 				goto out;
589 
590 			WRITE_ONCE(tp->mtu_info, info);
591 			if (!sock_owned_by_user(sk)) {
592 				tcp_v4_mtu_reduced(sk);
593 			} else {
594 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
595 					sock_hold(sk);
596 			}
597 			goto out;
598 		}
599 
600 		err = icmp_err_convert[code].errno;
601 		/* check if this ICMP message allows revert of backoff.
602 		 * (see RFC 6069)
603 		 */
604 		if (!fastopen &&
605 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
606 			tcp_ld_RTO_revert(sk, seq);
607 		break;
608 	case ICMP_TIME_EXCEEDED:
609 		err = EHOSTUNREACH;
610 		break;
611 	default:
612 		goto out;
613 	}
614 
615 	switch (sk->sk_state) {
616 	case TCP_SYN_SENT:
617 	case TCP_SYN_RECV:
618 		/* Only in fast or simultaneous open. If a fast open socket is
619 		 * already accepted it is treated as a connected one below.
620 		 */
621 		if (fastopen && !fastopen->sk)
622 			break;
623 
624 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
625 
626 		if (!sock_owned_by_user(sk))
627 			tcp_done_with_error(sk, err);
628 		else
629 			WRITE_ONCE(sk->sk_err_soft, err);
630 		goto out;
631 	}
632 
633 	/* If we've already connected we will keep trying
634 	 * until we time out, or the user gives up.
635 	 *
636 	 * RFC 1122 4.2.3.9 allows considering only PROTO_UNREACH and
637 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
638 	 * obsoleted by PMTU discovery).
639 	 *
640 	 * Note that in the modern internet, where routing is unreliable
641 	 * and broken firewalls sit in every dark corner sending random
642 	 * errors ordered by their masters, even these two messages have
643 	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
644 	 *
645 	 * Now we are in compliance with RFCs.
646 	 *							--ANK (980905)
647 	 */
648 
649 	if (!sock_owned_by_user(sk) &&
650 	    inet_test_bit(RECVERR, sk)) {
651 		WRITE_ONCE(sk->sk_err, err);
652 		sk_error_report(sk);
653 	} else	{ /* Only an error on timeout */
654 		WRITE_ONCE(sk->sk_err_soft, err);
655 	}
656 
657 out:
658 	bh_unlock_sock(sk);
659 	sock_put(sk);
660 	return 0;
661 }
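
/* Editorial illustration (not part of the kernel source): for sockets in
 * SYN_SENT/SYN_RECV the handler above also queues the error for userspace via
 * ip_icmp_error(), and RECVERR controls whether sk_err is reported directly.
 * A hedged sketch of draining the error queue with the standard IP_RECVERR /
 * MSG_ERRQUEUE interface:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *	#include <linux/errqueue.h>
 *
 *	static void drain_errqueue(int fd)
 *	{
 *		char cbuf[512];
 *		struct msghdr msg = { .msg_control = cbuf,
 *				      .msg_controllen = sizeof(cbuf) };
 *		struct cmsghdr *cm;
 *		int on = 1;
 *
 *		setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
 *		if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
 *			return;
 *		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *			if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
 *				struct sock_extended_err *ee = (void *)CMSG_DATA(cm);
 *
 *				(void)ee;	// ee->ee_type / ee->ee_code hold the ICMP type and code
 *			}
 *	}
 */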
662 
663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
664 {
665 	struct tcphdr *th = tcp_hdr(skb);
666 
667 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
668 	skb->csum_start = skb_transport_header(skb) - skb->head;
669 	skb->csum_offset = offsetof(struct tcphdr, check);
670 }
671 
672 /* This routine computes an IPv4 TCP checksum. */
673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
674 {
675 	const struct inet_sock *inet = inet_sk(sk);
676 
677 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
678 }
679 EXPORT_SYMBOL(tcp_v4_send_check);
680 
681 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
682 
683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
684 				 const struct tcp_ao_hdr *aoh,
685 				 struct ip_reply_arg *arg, struct tcphdr *reply,
686 				 __be32 reply_options[REPLY_OPTIONS_LEN])
687 {
688 #ifdef CONFIG_TCP_AO
689 	int sdif = tcp_v4_sdif(skb);
690 	int dif = inet_iif(skb);
691 	int l3index = sdif ? dif : 0;
692 	bool allocated_traffic_key;
693 	struct tcp_ao_key *key;
694 	char *traffic_key;
695 	bool drop = true;
696 	u32 ao_sne = 0;
697 	u8 keyid;
698 
699 	rcu_read_lock();
700 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
701 				 &key, &traffic_key, &allocated_traffic_key,
702 				 &keyid, &ao_sne))
703 		goto out;
704 
705 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
706 				 (aoh->rnext_keyid << 8) | keyid);
707 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
708 	reply->doff = arg->iov[0].iov_len / 4;
709 
710 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
711 			    key, traffic_key,
712 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
714 			    reply, ao_sne))
715 		goto out;
716 	drop = false;
717 out:
718 	rcu_read_unlock();
719 	if (allocated_traffic_key)
720 		kfree(traffic_key);
721 	return drop;
722 #else
723 	return true;
724 #endif
725 }
726 
727 /*
728  *	This routine will send an RST to the other tcp.
729  *
730  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
731  *		      for the reset?
732  *	Answer: if a packet caused an RST, it is not for a socket
733  *		existing in our system; if it is matched to a socket,
734  *		it is just a duplicate segment or a bug in the other side's
735  *		TCP. So we build the reply based only on the parameters
736  *		that arrived with the segment.
737  *	Exception: precedence violation. We do not implement it in any case.
738  */
739 
740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
741 			      enum sk_rst_reason reason)
742 {
743 	const struct tcphdr *th = tcp_hdr(skb);
744 	struct {
745 		struct tcphdr th;
746 		__be32 opt[REPLY_OPTIONS_LEN];
747 	} rep;
748 	const __u8 *md5_hash_location = NULL;
749 	const struct tcp_ao_hdr *aoh;
750 	struct ip_reply_arg arg;
751 #ifdef CONFIG_TCP_MD5SIG
752 	struct tcp_md5sig_key *key = NULL;
753 	unsigned char newhash[16];
754 	struct sock *sk1 = NULL;
755 	int genhash;
756 #endif
757 	u64 transmit_time = 0;
758 	struct sock *ctl_sk;
759 	struct net *net;
760 	u32 txhash = 0;
761 
762 	/* Never send a reset in response to a reset. */
763 	if (th->rst)
764 		return;
765 
766 	/* If sk not NULL, it means we did a successful lookup and incoming
767 	 * route had to be correct. prequeue might have dropped our dst.
768 	 */
769 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
770 		return;
771 
772 	/* Swap the send and the receive. */
773 	memset(&rep, 0, sizeof(rep));
774 	rep.th.dest   = th->source;
775 	rep.th.source = th->dest;
776 	rep.th.doff   = sizeof(struct tcphdr) / 4;
777 	rep.th.rst    = 1;
778 
779 	if (th->ack) {
780 		rep.th.seq = th->ack_seq;
781 	} else {
782 		rep.th.ack = 1;
783 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
784 				       skb->len - (th->doff << 2));
785 	}
786 
787 	memset(&arg, 0, sizeof(arg));
788 	arg.iov[0].iov_base = (unsigned char *)&rep;
789 	arg.iov[0].iov_len  = sizeof(rep.th);
790 
791 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
792 
793 	/* Invalid TCP option size or twice included auth */
794 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
795 		return;
796 
797 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
798 		return;
799 
800 #ifdef CONFIG_TCP_MD5SIG
801 	rcu_read_lock();
802 	if (sk && sk_fullsock(sk)) {
803 		const union tcp_md5_addr *addr;
804 		int l3index;
805 
806 		/* sdif set, means packet ingressed via a device
807 		 * in an L3 domain and inet_iif is set to it.
808 		 */
809 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
810 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
811 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
812 	} else if (md5_hash_location) {
813 		const union tcp_md5_addr *addr;
814 		int sdif = tcp_v4_sdif(skb);
815 		int dif = inet_iif(skb);
816 		int l3index;
817 
818 		/*
819 		 * The active side is lost. Try to find the listening socket
820 		 * through the source port, and then find the md5 key through
821 		 * the listening socket. We do not loosen security here:
822 		 * the incoming packet is checked with the md5 hash computed
823 		 * from the key we found; no RST is generated if the hash doesn't match.
824 		 */
825 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
826 					     NULL, 0, ip_hdr(skb)->saddr,
827 					     th->source, ip_hdr(skb)->daddr,
828 					     ntohs(th->source), dif, sdif);
829 		/* don't send rst if it can't find key */
830 		if (!sk1)
831 			goto out;
832 
833 		/* sdif set, means packet ingressed via a device
834 		 * in an L3 domain and dif is set to it.
835 		 */
836 		l3index = sdif ? dif : 0;
837 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
838 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
839 		if (!key)
840 			goto out;
841 
842 
843 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
844 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
845 			goto out;
846 
847 	}
848 
849 	if (key) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
851 				   (TCPOPT_NOP << 16) |
852 				   (TCPOPT_MD5SIG << 8) |
853 				   TCPOLEN_MD5SIG);
854 		/* Update length and the length the header thinks exists */
855 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 		rep.th.doff = arg.iov[0].iov_len / 4;
857 
858 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
859 				     key, ip_hdr(skb)->saddr,
860 				     ip_hdr(skb)->daddr, &rep.th);
861 	}
862 #endif
863 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
864 	if (rep.opt[0] == 0) {
865 		__be32 mrst = mptcp_reset_option(skb);
866 
867 		if (mrst) {
868 			rep.opt[0] = mrst;
869 			arg.iov[0].iov_len += sizeof(mrst);
870 			rep.th.doff = arg.iov[0].iov_len / 4;
871 		}
872 	}
873 
874 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
875 				      ip_hdr(skb)->saddr, /* XXX */
876 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
877 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
878 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
879 
880 	/* When socket is gone, all binding information is lost.
881 	 * Routing might fail in this case. There is no good choice here: if we
882 	 * force the input interface, we will misroute in case of an asymmetric route.
883 	 */
884 	if (sk)
885 		arg.bound_dev_if = sk->sk_bound_dev_if;
886 
887 	trace_tcp_send_reset(sk, skb, reason);
888 
889 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
890 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
891 
892 	arg.tos = ip_hdr(skb)->tos;
893 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
894 	local_bh_disable();
895 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
896 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
897 
898 	sock_net_set(ctl_sk, net);
899 	if (sk) {
900 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
901 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
902 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
903 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
904 		transmit_time = tcp_transmit_time(sk);
905 		xfrm_sk_clone_policy(ctl_sk, sk);
906 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
907 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
908 	} else {
909 		ctl_sk->sk_mark = 0;
910 		ctl_sk->sk_priority = 0;
911 	}
912 	ip_send_unicast_reply(ctl_sk, sk,
913 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
914 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
915 			      &arg, arg.iov[0].iov_len,
916 			      transmit_time, txhash);
917 
918 	xfrm_sk_free_policy(ctl_sk);
919 	sock_net_set(ctl_sk, &init_net);
920 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
921 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
922 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
923 	local_bh_enable();
924 
925 #ifdef CONFIG_TCP_MD5SIG
926 out:
927 	rcu_read_unlock();
928 #endif
929 }
930 
931 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
932    outside of socket context, is certainly ugly. What can I do?
933  */
934 
935 static void tcp_v4_send_ack(const struct sock *sk,
936 			    struct sk_buff *skb, u32 seq, u32 ack,
937 			    u32 win, u32 tsval, u32 tsecr, int oif,
938 			    struct tcp_key *key,
939 			    int reply_flags, u8 tos, u32 txhash)
940 {
941 	const struct tcphdr *th = tcp_hdr(skb);
942 	struct {
943 		struct tcphdr th;
944 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
945 	} rep;
946 	struct net *net = sock_net(sk);
947 	struct ip_reply_arg arg;
948 	struct sock *ctl_sk;
949 	u64 transmit_time;
950 
951 	memset(&rep.th, 0, sizeof(struct tcphdr));
952 	memset(&arg, 0, sizeof(arg));
953 
954 	arg.iov[0].iov_base = (unsigned char *)&rep;
955 	arg.iov[0].iov_len  = sizeof(rep.th);
956 	if (tsecr) {
957 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
958 				   (TCPOPT_TIMESTAMP << 8) |
959 				   TCPOLEN_TIMESTAMP);
960 		rep.opt[1] = htonl(tsval);
961 		rep.opt[2] = htonl(tsecr);
962 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
963 	}
964 
965 	/* Swap the send and the receive. */
966 	rep.th.dest    = th->source;
967 	rep.th.source  = th->dest;
968 	rep.th.doff    = arg.iov[0].iov_len / 4;
969 	rep.th.seq     = htonl(seq);
970 	rep.th.ack_seq = htonl(ack);
971 	rep.th.ack     = 1;
972 	rep.th.window  = htons(win);
973 
974 #ifdef CONFIG_TCP_MD5SIG
975 	if (tcp_key_is_md5(key)) {
976 		int offset = (tsecr) ? 3 : 0;
977 
978 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
979 					  (TCPOPT_NOP << 16) |
980 					  (TCPOPT_MD5SIG << 8) |
981 					  TCPOLEN_MD5SIG);
982 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
983 		rep.th.doff = arg.iov[0].iov_len/4;
984 
985 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
986 				    key->md5_key, ip_hdr(skb)->saddr,
987 				    ip_hdr(skb)->daddr, &rep.th);
988 	}
989 #endif
990 #ifdef CONFIG_TCP_AO
991 	if (tcp_key_is_ao(key)) {
992 		int offset = (tsecr) ? 3 : 0;
993 
994 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
995 					  (tcp_ao_len(key->ao_key) << 16) |
996 					  (key->ao_key->sndid << 8) |
997 					  key->rcv_next);
998 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
999 		rep.th.doff = arg.iov[0].iov_len / 4;
1000 
1001 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1002 				key->ao_key, key->traffic_key,
1003 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1004 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1005 				&rep.th, key->sne);
1006 	}
1007 #endif
1008 	arg.flags = reply_flags;
1009 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1010 				      ip_hdr(skb)->saddr, /* XXX */
1011 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1012 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1013 	if (oif)
1014 		arg.bound_dev_if = oif;
1015 	arg.tos = tos;
1016 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1017 	local_bh_disable();
1018 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1019 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1020 	sock_net_set(ctl_sk, net);
1021 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1022 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1023 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1024 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1025 	transmit_time = tcp_transmit_time(sk);
1026 	ip_send_unicast_reply(ctl_sk, sk,
1027 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1028 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1029 			      &arg, arg.iov[0].iov_len,
1030 			      transmit_time, txhash);
1031 
1032 	sock_net_set(ctl_sk, &init_net);
1033 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1034 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1035 	local_bh_enable();
1036 }
1037 
1038 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1039 {
1040 	struct inet_timewait_sock *tw = inet_twsk(sk);
1041 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1042 	struct tcp_key key = {};
1043 #ifdef CONFIG_TCP_AO
1044 	struct tcp_ao_info *ao_info;
1045 
1046 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1047 		/* FIXME: the segment to-be-acked is not verified yet */
1048 		ao_info = rcu_dereference(tcptw->ao_info);
1049 		if (ao_info) {
1050 			const struct tcp_ao_hdr *aoh;
1051 
1052 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1053 				inet_twsk_put(tw);
1054 				return;
1055 			}
1056 
1057 			if (aoh)
1058 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1059 								    aoh->rnext_keyid, -1);
1060 		}
1061 	}
1062 	if (key.ao_key) {
1063 		struct tcp_ao_key *rnext_key;
1064 
1065 		key.traffic_key = snd_other_key(key.ao_key);
1066 		key.sne = READ_ONCE(ao_info->snd_sne);
1067 		rnext_key = READ_ONCE(ao_info->rnext_key);
1068 		key.rcv_next = rnext_key->rcvid;
1069 		key.type = TCP_KEY_AO;
1070 #else
1071 	if (0) {
1072 #endif
1073 	} else if (static_branch_tcp_md5()) {
1074 		key.md5_key = tcp_twsk_md5_key(tcptw);
1075 		if (key.md5_key)
1076 			key.type = TCP_KEY_MD5;
1077 	}
1078 
1079 	tcp_v4_send_ack(sk, skb,
1080 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1081 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1082 			tcp_tw_tsval(tcptw),
1083 			READ_ONCE(tcptw->tw_ts_recent),
1084 			tw->tw_bound_dev_if, &key,
1085 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1086 			tw->tw_tos,
1087 			tw->tw_txhash);
1088 
1089 	inet_twsk_put(tw);
1090 }
1091 
1092 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1093 				  struct request_sock *req)
1094 {
1095 	struct tcp_key key = {};
1096 
1097 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1098 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1099 	 */
1100 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1101 					     tcp_sk(sk)->snd_nxt;
1102 
1103 #ifdef CONFIG_TCP_AO
1104 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1105 	    tcp_rsk_used_ao(req)) {
1106 		const union tcp_md5_addr *addr;
1107 		const struct tcp_ao_hdr *aoh;
1108 		int l3index;
1109 
1110 		/* Invalid TCP option size or twice included auth */
1111 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1112 			return;
1113 		if (!aoh)
1114 			return;
1115 
1116 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1117 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1118 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1119 					      aoh->rnext_keyid, -1);
1120 		if (unlikely(!key.ao_key)) {
1121 			/* Send ACK with any matching MKT for the peer */
1122 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1123 			/* The matching key disappeared (user removed the key?);
1124 			 * let the handshake time out.
1125 			 */
1126 			if (!key.ao_key) {
1127 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1128 						     addr,
1129 						     ntohs(tcp_hdr(skb)->source),
1130 						     &ip_hdr(skb)->daddr,
1131 						     ntohs(tcp_hdr(skb)->dest));
1132 				return;
1133 			}
1134 		}
1135 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1136 		if (!key.traffic_key)
1137 			return;
1138 
1139 		key.type = TCP_KEY_AO;
1140 		key.rcv_next = aoh->keyid;
1141 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1142 #else
1143 	if (0) {
1144 #endif
1145 	} else if (static_branch_tcp_md5()) {
1146 		const union tcp_md5_addr *addr;
1147 		int l3index;
1148 
1149 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1150 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1151 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 		if (key.md5_key)
1153 			key.type = TCP_KEY_MD5;
1154 	}
1155 
1156 	tcp_v4_send_ack(sk, skb, seq,
1157 			tcp_rsk(req)->rcv_nxt,
1158 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1159 			tcp_rsk_tsval(tcp_rsk(req)),
1160 			READ_ONCE(req->ts_recent),
1161 			0, &key,
1162 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1163 			ip_hdr(skb)->tos,
1164 			READ_ONCE(tcp_rsk(req)->txhash));
1165 	if (tcp_key_is_ao(&key))
1166 		kfree(key.traffic_key);
1167 }
1168 
1169 /*
1170  *	Send a SYN-ACK after having received a SYN.
1171  *	This still operates on a request_sock only, not on a big
1172  *	socket.
1173  */
1174 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1175 			      struct flowi *fl,
1176 			      struct request_sock *req,
1177 			      struct tcp_fastopen_cookie *foc,
1178 			      enum tcp_synack_type synack_type,
1179 			      struct sk_buff *syn_skb)
1180 {
1181 	const struct inet_request_sock *ireq = inet_rsk(req);
1182 	struct flowi4 fl4;
1183 	int err = -1;
1184 	struct sk_buff *skb;
1185 	u8 tos;
1186 
1187 	/* First, grab a route. */
1188 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1189 		return -1;
1190 
1191 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1192 
1193 	if (skb) {
1194 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1195 
1196 		tos = READ_ONCE(inet_sk(sk)->tos);
1197 
1198 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1199 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1200 			      (tos & INET_ECN_MASK);
1201 
1202 		if (!INET_ECN_is_capable(tos) &&
1203 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1204 			tos |= INET_ECN_ECT_0;
1205 
1206 		rcu_read_lock();
1207 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1208 					    ireq->ir_rmt_addr,
1209 					    rcu_dereference(ireq->ireq_opt),
1210 					    tos);
1211 		rcu_read_unlock();
1212 		err = net_xmit_eval(err);
1213 	}
1214 
1215 	return err;
1216 }
1217 
1218 /*
1219  *	IPv4 request_sock destructor.
1220  */
1221 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1222 {
1223 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1224 }
1225 
1226 #ifdef CONFIG_TCP_MD5SIG
1227 /*
1228  * RFC2385 MD5 checksumming requires a mapping of
1229  * IP address->MD5 Key.
1230  * We need to maintain these in the sk structure.
1231  */
1232 
1233 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1234 EXPORT_SYMBOL(tcp_md5_needed);
1235 
1236 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1237 {
1238 	if (!old)
1239 		return true;
1240 
1241 	/* l3index always overrides non-l3index */
1242 	if (old->l3index && new->l3index == 0)
1243 		return false;
1244 	if (old->l3index == 0 && new->l3index)
1245 		return true;
1246 
1247 	return old->prefixlen < new->prefixlen;
1248 }
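
/* Editorial note (not part of the kernel source): a worked example of the
 * precedence implemented above, with only the relevant fields filled in:
 *
 *	struct tcp_md5sig_key wide = { .l3index = 0, .prefixlen = 24 };
 *	struct tcp_md5sig_key vrf  = { .l3index = 5, .prefixlen = 8 };
 *
 *	better_md5_match(&wide, &vrf);	// true: an L3-bound key beats any
 *					// unbound key, regardless of prefixlen
 *	better_md5_match(&vrf, &wide);	// false, for the same reason
 */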
1249 
1250 /* Find the Key structure for an address.  */
1251 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1252 					   const union tcp_md5_addr *addr,
1253 					   int family, bool any_l3index)
1254 {
1255 	const struct tcp_sock *tp = tcp_sk(sk);
1256 	struct tcp_md5sig_key *key;
1257 	const struct tcp_md5sig_info *md5sig;
1258 	__be32 mask;
1259 	struct tcp_md5sig_key *best_match = NULL;
1260 	bool match;
1261 
1262 	/* caller either holds rcu_read_lock() or socket lock */
1263 	md5sig = rcu_dereference_check(tp->md5sig_info,
1264 				       lockdep_sock_is_held(sk));
1265 	if (!md5sig)
1266 		return NULL;
1267 
1268 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1269 				 lockdep_sock_is_held(sk)) {
1270 		if (key->family != family)
1271 			continue;
1272 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1273 		    key->l3index != l3index)
1274 			continue;
1275 		if (family == AF_INET) {
1276 			mask = inet_make_mask(key->prefixlen);
1277 			match = (key->addr.a4.s_addr & mask) ==
1278 				(addr->a4.s_addr & mask);
1279 #if IS_ENABLED(CONFIG_IPV6)
1280 		} else if (family == AF_INET6) {
1281 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1282 						  key->prefixlen);
1283 #endif
1284 		} else {
1285 			match = false;
1286 		}
1287 
1288 		if (match && better_md5_match(best_match, key))
1289 			best_match = key;
1290 	}
1291 	return best_match;
1292 }
1293 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1294 
1295 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1296 						      const union tcp_md5_addr *addr,
1297 						      int family, u8 prefixlen,
1298 						      int l3index, u8 flags)
1299 {
1300 	const struct tcp_sock *tp = tcp_sk(sk);
1301 	struct tcp_md5sig_key *key;
1302 	unsigned int size = sizeof(struct in_addr);
1303 	const struct tcp_md5sig_info *md5sig;
1304 
1305 	/* caller either holds rcu_read_lock() or socket lock */
1306 	md5sig = rcu_dereference_check(tp->md5sig_info,
1307 				       lockdep_sock_is_held(sk));
1308 	if (!md5sig)
1309 		return NULL;
1310 #if IS_ENABLED(CONFIG_IPV6)
1311 	if (family == AF_INET6)
1312 		size = sizeof(struct in6_addr);
1313 #endif
1314 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1315 				 lockdep_sock_is_held(sk)) {
1316 		if (key->family != family)
1317 			continue;
1318 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1319 			continue;
1320 		if (key->l3index != l3index)
1321 			continue;
1322 		if (!memcmp(&key->addr, addr, size) &&
1323 		    key->prefixlen == prefixlen)
1324 			return key;
1325 	}
1326 	return NULL;
1327 }
1328 
1329 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1330 					 const struct sock *addr_sk)
1331 {
1332 	const union tcp_md5_addr *addr;
1333 	int l3index;
1334 
1335 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1336 						 addr_sk->sk_bound_dev_if);
1337 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1338 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1339 }
1340 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1341 
1342 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1343 {
1344 	struct tcp_sock *tp = tcp_sk(sk);
1345 	struct tcp_md5sig_info *md5sig;
1346 
1347 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1348 	if (!md5sig)
1349 		return -ENOMEM;
1350 
1351 	sk_gso_disable(sk);
1352 	INIT_HLIST_HEAD(&md5sig->head);
1353 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1354 	return 0;
1355 }
1356 
1357 /* This can be called on a newly created socket, from other files */
1358 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1359 			    int family, u8 prefixlen, int l3index, u8 flags,
1360 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1361 {
1362 	/* Add Key to the list */
1363 	struct tcp_md5sig_key *key;
1364 	struct tcp_sock *tp = tcp_sk(sk);
1365 	struct tcp_md5sig_info *md5sig;
1366 
1367 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1368 	if (key) {
1369 		/* Pre-existing entry - just update that one.
1370 		 * Note that the key might be used concurrently.
1371 		 * data_race() is telling kcsan that we do not care of
1372 		 * key mismatches, since changing MD5 key on live flows
1373 		 * can lead to packet drops.
1374 		 */
1375 		data_race(memcpy(key->key, newkey, newkeylen));
1376 
1377 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1378 		 * Also note that a reader could catch new key->keylen value
1379 		 * but old key->key[], this is the reason we use __GFP_ZERO
1380 		 * at sock_kmalloc() time below these lines.
1381 		 */
1382 		WRITE_ONCE(key->keylen, newkeylen);
1383 
1384 		return 0;
1385 	}
1386 
1387 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1388 					   lockdep_sock_is_held(sk));
1389 
1390 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1391 	if (!key)
1392 		return -ENOMEM;
1393 
1394 	memcpy(key->key, newkey, newkeylen);
1395 	key->keylen = newkeylen;
1396 	key->family = family;
1397 	key->prefixlen = prefixlen;
1398 	key->l3index = l3index;
1399 	key->flags = flags;
1400 	memcpy(&key->addr, addr,
1401 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1402 								 sizeof(struct in_addr));
1403 	hlist_add_head_rcu(&key->node, &md5sig->head);
1404 	return 0;
1405 }
1406 
1407 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1408 		   int family, u8 prefixlen, int l3index, u8 flags,
1409 		   const u8 *newkey, u8 newkeylen)
1410 {
1411 	struct tcp_sock *tp = tcp_sk(sk);
1412 
1413 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1414 		if (tcp_md5_alloc_sigpool())
1415 			return -ENOMEM;
1416 
1417 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1418 			tcp_md5_release_sigpool();
1419 			return -ENOMEM;
1420 		}
1421 
1422 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1423 			struct tcp_md5sig_info *md5sig;
1424 
1425 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1426 			rcu_assign_pointer(tp->md5sig_info, NULL);
1427 			kfree_rcu(md5sig, rcu);
1428 			tcp_md5_release_sigpool();
1429 			return -EUSERS;
1430 		}
1431 	}
1432 
1433 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1434 				newkey, newkeylen, GFP_KERNEL);
1435 }
1436 EXPORT_SYMBOL(tcp_md5_do_add);
1437 
1438 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1439 		     int family, u8 prefixlen, int l3index,
1440 		     struct tcp_md5sig_key *key)
1441 {
1442 	struct tcp_sock *tp = tcp_sk(sk);
1443 
1444 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1445 		tcp_md5_add_sigpool();
1446 
1447 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1448 			tcp_md5_release_sigpool();
1449 			return -ENOMEM;
1450 		}
1451 
1452 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1453 			struct tcp_md5sig_info *md5sig;
1454 
1455 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1456 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1457 			rcu_assign_pointer(tp->md5sig_info, NULL);
1458 			kfree_rcu(md5sig, rcu);
1459 			tcp_md5_release_sigpool();
1460 			return -EUSERS;
1461 		}
1462 	}
1463 
1464 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1465 				key->flags, key->key, key->keylen,
1466 				sk_gfp_mask(sk, GFP_ATOMIC));
1467 }
1468 EXPORT_SYMBOL(tcp_md5_key_copy);
1469 
1470 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1471 		   u8 prefixlen, int l3index, u8 flags)
1472 {
1473 	struct tcp_md5sig_key *key;
1474 
1475 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1476 	if (!key)
1477 		return -ENOENT;
1478 	hlist_del_rcu(&key->node);
1479 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1480 	kfree_rcu(key, rcu);
1481 	return 0;
1482 }
1483 EXPORT_SYMBOL(tcp_md5_do_del);
1484 
1485 void tcp_clear_md5_list(struct sock *sk)
1486 {
1487 	struct tcp_sock *tp = tcp_sk(sk);
1488 	struct tcp_md5sig_key *key;
1489 	struct hlist_node *n;
1490 	struct tcp_md5sig_info *md5sig;
1491 
1492 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1493 
1494 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1495 		hlist_del_rcu(&key->node);
1496 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1497 		kfree_rcu(key, rcu);
1498 	}
1499 }
1500 
1501 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1502 				 sockptr_t optval, int optlen)
1503 {
1504 	struct tcp_md5sig cmd;
1505 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1506 	const union tcp_md5_addr *addr;
1507 	u8 prefixlen = 32;
1508 	int l3index = 0;
1509 	bool l3flag;
1510 	u8 flags;
1511 
1512 	if (optlen < sizeof(cmd))
1513 		return -EINVAL;
1514 
1515 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1516 		return -EFAULT;
1517 
1518 	if (sin->sin_family != AF_INET)
1519 		return -EINVAL;
1520 
1521 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1522 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1523 
1524 	if (optname == TCP_MD5SIG_EXT &&
1525 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1526 		prefixlen = cmd.tcpm_prefixlen;
1527 		if (prefixlen > 32)
1528 			return -EINVAL;
1529 	}
1530 
1531 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1532 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1533 		struct net_device *dev;
1534 
1535 		rcu_read_lock();
1536 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1537 		if (dev && netif_is_l3_master(dev))
1538 			l3index = dev->ifindex;
1539 
1540 		rcu_read_unlock();
1541 
1542 		/* ok to reference set/not set outside of rcu;
1543 		 * right now device MUST be an L3 master
1544 		 */
1545 		if (!dev || !l3index)
1546 			return -EINVAL;
1547 	}
1548 
1549 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1550 
1551 	if (!cmd.tcpm_keylen)
1552 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1553 
1554 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1555 		return -EINVAL;
1556 
1557 	/* Don't allow keys for peers that have a matching TCP-AO key.
1558 	 * See the comment in tcp_ao_add_cmd()
1559 	 */
1560 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1561 		return -EKEYREJECTED;
1562 
1563 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1564 			      cmd.tcpm_key, cmd.tcpm_keylen);
1565 }
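
/* Editorial illustration (not part of the kernel source): the parser above
 * services the TCP_MD5SIG/TCP_MD5SIG_EXT socket options. A minimal userspace
 * sketch installing a key for one peer (pass tcpm_keylen == 0 to delete),
 * assuming <linux/tcp.h> for struct tcp_md5sig:
 *
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <linux/tcp.h>
 *
 *	static int install_md5_key(int fd, const char *peer, const char *secret)
 *	{
 *		struct tcp_md5sig md5 = {};
 *		struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *		sin->sin_family = AF_INET;
 *		if (inet_pton(AF_INET, peer, &sin->sin_addr) != 1)
 *			return -1;
 *		md5.tcpm_keylen = strlen(secret);	// must not exceed TCP_MD5SIG_MAXKEYLEN
 *		memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *	}
 */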
1566 
1567 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1568 				   __be32 daddr, __be32 saddr,
1569 				   const struct tcphdr *th, int nbytes)
1570 {
1571 	struct tcp4_pseudohdr *bp;
1572 	struct scatterlist sg;
1573 	struct tcphdr *_th;
1574 
1575 	bp = hp->scratch;
1576 	bp->saddr = saddr;
1577 	bp->daddr = daddr;
1578 	bp->pad = 0;
1579 	bp->protocol = IPPROTO_TCP;
1580 	bp->len = cpu_to_be16(nbytes);
1581 
1582 	_th = (struct tcphdr *)(bp + 1);
1583 	memcpy(_th, th, sizeof(*th));
1584 	_th->check = 0;
1585 
1586 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1587 	ahash_request_set_crypt(hp->req, &sg, NULL,
1588 				sizeof(*bp) + sizeof(*th));
1589 	return crypto_ahash_update(hp->req);
1590 }
1591 
1592 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1593 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1594 {
1595 	struct tcp_sigpool hp;
1596 
1597 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1598 		goto clear_hash_nostart;
1599 
1600 	if (crypto_ahash_init(hp.req))
1601 		goto clear_hash;
1602 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1603 		goto clear_hash;
1604 	if (tcp_md5_hash_key(&hp, key))
1605 		goto clear_hash;
1606 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1607 	if (crypto_ahash_final(hp.req))
1608 		goto clear_hash;
1609 
1610 	tcp_sigpool_end(&hp);
1611 	return 0;
1612 
1613 clear_hash:
1614 	tcp_sigpool_end(&hp);
1615 clear_hash_nostart:
1616 	memset(md5_hash, 0, 16);
1617 	return 1;
1618 }
1619 
1620 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1621 			const struct sock *sk,
1622 			const struct sk_buff *skb)
1623 {
1624 	const struct tcphdr *th = tcp_hdr(skb);
1625 	struct tcp_sigpool hp;
1626 	__be32 saddr, daddr;
1627 
1628 	if (sk) { /* valid for establish/request sockets */
1629 		saddr = sk->sk_rcv_saddr;
1630 		daddr = sk->sk_daddr;
1631 	} else {
1632 		const struct iphdr *iph = ip_hdr(skb);
1633 		saddr = iph->saddr;
1634 		daddr = iph->daddr;
1635 	}
1636 
1637 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1638 		goto clear_hash_nostart;
1639 
1640 	if (crypto_ahash_init(hp.req))
1641 		goto clear_hash;
1642 
1643 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1644 		goto clear_hash;
1645 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1646 		goto clear_hash;
1647 	if (tcp_md5_hash_key(&hp, key))
1648 		goto clear_hash;
1649 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1650 	if (crypto_ahash_final(hp.req))
1651 		goto clear_hash;
1652 
1653 	tcp_sigpool_end(&hp);
1654 	return 0;
1655 
1656 clear_hash:
1657 	tcp_sigpool_end(&hp);
1658 clear_hash_nostart:
1659 	memset(md5_hash, 0, 16);
1660 	return 1;
1661 }
1662 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1663 
1664 #endif
1665 
1666 static void tcp_v4_init_req(struct request_sock *req,
1667 			    const struct sock *sk_listener,
1668 			    struct sk_buff *skb)
1669 {
1670 	struct inet_request_sock *ireq = inet_rsk(req);
1671 	struct net *net = sock_net(sk_listener);
1672 
1673 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1674 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1675 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1676 }
1677 
1678 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1679 					  struct sk_buff *skb,
1680 					  struct flowi *fl,
1681 					  struct request_sock *req,
1682 					  u32 tw_isn)
1683 {
1684 	tcp_v4_init_req(req, sk, skb);
1685 
1686 	if (security_inet_conn_request(sk, skb, req))
1687 		return NULL;
1688 
1689 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1690 }
1691 
1692 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1693 	.family		=	PF_INET,
1694 	.obj_size	=	sizeof(struct tcp_request_sock),
1695 	.rtx_syn_ack	=	tcp_rtx_synack,
1696 	.send_ack	=	tcp_v4_reqsk_send_ack,
1697 	.destructor	=	tcp_v4_reqsk_destructor,
1698 	.send_reset	=	tcp_v4_send_reset,
1699 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1700 };
1701 
1702 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1703 	.mss_clamp	=	TCP_MSS_DEFAULT,
1704 #ifdef CONFIG_TCP_MD5SIG
1705 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1706 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1707 #endif
1708 #ifdef CONFIG_TCP_AO
1709 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1710 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1711 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1712 #endif
1713 #ifdef CONFIG_SYN_COOKIES
1714 	.cookie_init_seq =	cookie_v4_init_sequence,
1715 #endif
1716 	.route_req	=	tcp_v4_route_req,
1717 	.init_seq	=	tcp_v4_init_seq,
1718 	.init_ts_off	=	tcp_v4_init_ts_off,
1719 	.send_synack	=	tcp_v4_send_synack,
1720 };
1721 
1722 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1723 {
1724 	/* Never answer SYNs sent to broadcast or multicast */
1725 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1726 		goto drop;
1727 
1728 	return tcp_conn_request(&tcp_request_sock_ops,
1729 				&tcp_request_sock_ipv4_ops, sk, skb);
1730 
1731 drop:
1732 	tcp_listendrop(sk);
1733 	return 0;
1734 }
1735 EXPORT_SYMBOL(tcp_v4_conn_request);
1736 
1737 
1738 /*
1739  * The three way handshake has completed - we got a valid synack -
1740  * now create the new socket.
1741  */
1742 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1743 				  struct request_sock *req,
1744 				  struct dst_entry *dst,
1745 				  struct request_sock *req_unhash,
1746 				  bool *own_req)
1747 {
1748 	struct inet_request_sock *ireq;
1749 	bool found_dup_sk = false;
1750 	struct inet_sock *newinet;
1751 	struct tcp_sock *newtp;
1752 	struct sock *newsk;
1753 #ifdef CONFIG_TCP_MD5SIG
1754 	const union tcp_md5_addr *addr;
1755 	struct tcp_md5sig_key *key;
1756 	int l3index;
1757 #endif
1758 	struct ip_options_rcu *inet_opt;
1759 
1760 	if (sk_acceptq_is_full(sk))
1761 		goto exit_overflow;
1762 
1763 	newsk = tcp_create_openreq_child(sk, req, skb);
1764 	if (!newsk)
1765 		goto exit_nonewsk;
1766 
1767 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1768 	inet_sk_rx_dst_set(newsk, skb);
1769 
1770 	newtp		      = tcp_sk(newsk);
1771 	newinet		      = inet_sk(newsk);
1772 	ireq		      = inet_rsk(req);
1773 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1774 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1775 	newsk->sk_bound_dev_if = ireq->ir_iif;
1776 	newinet->inet_saddr   = ireq->ir_loc_addr;
1777 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1778 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1779 	newinet->mc_index     = inet_iif(skb);
1780 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1781 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1782 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1783 	if (inet_opt)
1784 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1785 	atomic_set(&newinet->inet_id, get_random_u16());
1786 
1787 	/* Set ToS of the new socket based upon the value of incoming SYN.
1788 	 * ECT bits are set later in tcp_init_transfer().
1789 	 */
1790 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1791 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1792 
1793 	if (!dst) {
1794 		dst = inet_csk_route_child_sock(sk, newsk, req);
1795 		if (!dst)
1796 			goto put_and_exit;
1797 	} else {
1798 		/* syncookie case: see end of cookie_v4_check() */
1799 	}
1800 	sk_setup_caps(newsk, dst);
1801 
1802 	tcp_ca_openreq_child(newsk, dst);
1803 
1804 	tcp_sync_mss(newsk, dst_mtu(dst));
1805 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1806 
1807 	tcp_initialize_rcv_mss(newsk);
1808 
1809 #ifdef CONFIG_TCP_MD5SIG
1810 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1811 	/* Copy over the MD5 key from the original socket */
1812 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1813 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1814 	if (key && !tcp_rsk_used_ao(req)) {
1815 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1816 			goto put_and_exit;
1817 		sk_gso_disable(newsk);
1818 	}
1819 #endif
1820 #ifdef CONFIG_TCP_AO
1821 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1822 		goto put_and_exit; /* OOM, release back memory */
1823 #endif
1824 
1825 	if (__inet_inherit_port(sk, newsk) < 0)
1826 		goto put_and_exit;
1827 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1828 				       &found_dup_sk);
1829 	if (likely(*own_req)) {
1830 		tcp_move_syn(newtp, req);
1831 		ireq->ireq_opt = NULL;
1832 	} else {
1833 		newinet->inet_opt = NULL;
1834 
1835 		if (!req_unhash && found_dup_sk) {
1836 			/* This code path should only be executed in the
1837 			 * syncookie case
1838 			 */
1839 			bh_unlock_sock(newsk);
1840 			sock_put(newsk);
1841 			newsk = NULL;
1842 		}
1843 	}
1844 	return newsk;
1845 
1846 exit_overflow:
1847 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1848 exit_nonewsk:
1849 	dst_release(dst);
1850 exit:
1851 	tcp_listendrop(sk);
1852 	return NULL;
1853 put_and_exit:
1854 	newinet->inet_opt = NULL;
1855 	inet_csk_prepare_forced_close(newsk);
1856 	tcp_done(newsk);
1857 	goto exit;
1858 }
1859 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1860 
1861 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1862 {
1863 #ifdef CONFIG_SYN_COOKIES
1864 	const struct tcphdr *th = tcp_hdr(skb);
1865 
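	/* On a listener, a non-SYN segment may be the ACK that completes a
	 * syncookie handshake: cookie_v4_check() validates the cookie and,
	 * if it is good, returns the freshly minted child socket.
	 */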
1866 	if (!th->syn)
1867 		sk = cookie_v4_check(sk, skb);
1868 #endif
1869 	return sk;
1870 }
1871 
1872 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1873 			 struct tcphdr *th, u32 *cookie)
1874 {
1875 	u16 mss = 0;
1876 #ifdef CONFIG_SYN_COOKIES
1877 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1878 				    &tcp_request_sock_ipv4_ops, sk, th);
1879 	if (mss) {
1880 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1881 		tcp_synq_overflow(sk);
1882 	}
1883 #endif
1884 	return mss;
1885 }
1886 
1887 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1888 							   u32));
1889 /* The socket must have its spinlock held when we get
1890  * here, unless it is a TCP_LISTEN socket.
1891  *
1892  * We have a potential double-lock case here, so even when
1893  * doing backlog processing we use the BH locking scheme.
1894  * This is because we cannot sleep with the original spinlock
1895  * held.
1896  */
1897 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1898 {
1899 	enum skb_drop_reason reason;
1900 	struct sock *rsk;
1901 
1902 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1903 		struct dst_entry *dst;
1904 
1905 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1906 						lockdep_sock_is_held(sk));
1907 
1908 		sock_rps_save_rxhash(sk, skb);
1909 		sk_mark_napi_id(sk, skb);
1910 		if (dst) {
1911 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1912 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1913 					     dst, 0)) {
1914 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1915 				dst_release(dst);
1916 			}
1917 		}
1918 		tcp_rcv_established(sk, skb);
1919 		return 0;
1920 	}
1921 
1922 	if (tcp_checksum_complete(skb))
1923 		goto csum_err;
1924 
1925 	if (sk->sk_state == TCP_LISTEN) {
1926 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1927 
1928 		if (!nsk)
1929 			return 0;
1930 		if (nsk != sk) {
1931 			reason = tcp_child_process(sk, nsk, skb);
1932 			if (reason) {
1933 				rsk = nsk;
1934 				goto reset;
1935 			}
1936 			return 0;
1937 		}
1938 	} else
1939 		sock_rps_save_rxhash(sk, skb);
1940 
1941 	reason = tcp_rcv_state_process(sk, skb);
1942 	if (reason) {
1943 		rsk = sk;
1944 		goto reset;
1945 	}
1946 	return 0;
1947 
1948 reset:
1949 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1950 discard:
1951 	sk_skb_reason_drop(sk, skb, reason);
1952 	/* Be careful here. If this function gets more complicated and
1953 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1954 	 * might be destroyed here. This current version compiles correctly,
1955 	 * but you have been warned.
1956 	 */
1957 	return 0;
1958 
1959 csum_err:
1960 	reason = SKB_DROP_REASON_TCP_CSUM;
1961 	trace_tcp_bad_csum(skb);
1962 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1963 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1964 	goto discard;
1965 }
1966 EXPORT_SYMBOL(tcp_v4_do_rcv);
1967 
1968 int tcp_v4_early_demux(struct sk_buff *skb)
1969 {
1970 	struct net *net = dev_net(skb->dev);
1971 	const struct iphdr *iph;
1972 	const struct tcphdr *th;
1973 	struct sock *sk;
1974 
1975 	if (skb->pkt_type != PACKET_HOST)
1976 		return 0;
1977 
1978 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1979 		return 0;
1980 
1981 	iph = ip_hdr(skb);
1982 	th = tcp_hdr(skb);
1983 
1984 	if (th->doff < sizeof(struct tcphdr) / 4)
1985 		return 0;
1986 
1987 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1988 				       iph->saddr, th->source,
1989 				       iph->daddr, ntohs(th->dest),
1990 				       skb->skb_iif, inet_sdif(skb));
1991 	if (sk) {
1992 		skb->sk = sk;
1993 		skb->destructor = sock_edemux;
1994 		if (sk_fullsock(sk)) {
1995 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1996 
1997 			if (dst)
1998 				dst = dst_check(dst, 0);
1999 			if (dst &&
2000 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2001 				skb_dst_set_noref(skb, dst);
2002 		}
2003 	}
2004 	return 0;
2005 }
2006 
2007 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2008 		     enum skb_drop_reason *reason)
2009 {
2010 	u32 tail_gso_size, tail_gso_segs;
2011 	struct skb_shared_info *shinfo;
2012 	const struct tcphdr *th;
2013 	struct tcphdr *thtail;
2014 	struct sk_buff *tail;
2015 	unsigned int hdrlen;
2016 	bool fragstolen;
2017 	u32 gso_segs;
2018 	u32 gso_size;
2019 	u64 limit;
2020 	int delta;
2021 
2022 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2023 	 * we can fix skb->truesize to its real value to avoid future drops.
2024 	 * This is valid because skb is not yet charged to the socket.
2025 	 * It has been noticed that pure SACK packets were sometimes dropped
2026 	 * (if cooked by drivers without the copybreak feature).
2027 	 */
2028 	skb_condense(skb);
2029 
2030 	tcp_cleanup_skb(skb);
2031 
2032 	if (unlikely(tcp_checksum_complete(skb))) {
2033 		bh_unlock_sock(sk);
2034 		trace_tcp_bad_csum(skb);
2035 		*reason = SKB_DROP_REASON_TCP_CSUM;
2036 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2037 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2038 		return true;
2039 	}
2040 
2041 	/* Attempt coalescing to last skb in backlog, even if we are
2042 	 * above the limits.
2043 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2044 	 */
2045 	th = (const struct tcphdr *)skb->data;
2046 	hdrlen = th->doff * 4;
2047 
2048 	tail = sk->sk_backlog.tail;
2049 	if (!tail)
2050 		goto no_coalesce;
2051 	thtail = (struct tcphdr *)tail->data;
2052 
2053 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2054 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2055 	    ((TCP_SKB_CB(tail)->tcp_flags |
2056 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2057 	    !((TCP_SKB_CB(tail)->tcp_flags &
2058 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2059 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2060 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2061 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2062 	    thtail->doff != th->doff ||
2063 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2064 		goto no_coalesce;
2065 
2066 	__skb_pull(skb, hdrlen);
2067 
2068 	shinfo = skb_shinfo(skb);
2069 	gso_size = shinfo->gso_size ?: skb->len;
2070 	gso_segs = shinfo->gso_segs ?: 1;
2071 
2072 	shinfo = skb_shinfo(tail);
2073 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2074 	tail_gso_segs = shinfo->gso_segs ?: 1;
2075 
2076 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2077 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2078 
2079 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2080 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2081 			thtail->window = th->window;
2082 		}
2083 
2084 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2085 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2086 		 * is not entered if we append a packet with a FIN.
2087 		 * SYN, RST, URG are not present.
2088 		 * ACK is set on both packets.
2089 		 * PSH : we do not really care in TCP stack,
2090 		 *       at least for 'GRO' packets.
2091 		 */
2092 		thtail->fin |= th->fin;
2093 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2094 
2095 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2096 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2097 			tail->tstamp = skb->tstamp;
2098 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2099 		}
2100 
2101 		/* Not as strict as GRO. We only need to carry the max mss value */
2102 		shinfo->gso_size = max(gso_size, tail_gso_size);
2103 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2104 
2105 		sk->sk_backlog.len += delta;
2106 		__NET_INC_STATS(sock_net(sk),
2107 				LINUX_MIB_TCPBACKLOGCOALESCE);
2108 		kfree_skb_partial(skb, fragstolen);
2109 		return false;
2110 	}
2111 	__skb_push(skb, hdrlen);
2112 
2113 no_coalesce:
2114 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2115 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2116 	 * sk_rcvbuf in normal conditions.
2117 	 */
2118 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2119 
2120 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2121 
2122 	/* Only the socket owner can try to collapse/prune rx queues
2123 	 * to reduce memory overhead, so add a little headroom here.
2124 	 * Only a few socket backlogs are likely to be non-empty at the same time.
2125 	 */
2126 	limit += 64 * 1024;
2127 
2128 	limit = min_t(u64, limit, UINT_MAX);
2129 
2130 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
2131 		bh_unlock_sock(sk);
2132 		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2133 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2134 		return true;
2135 	}
2136 	return false;
2137 }
2138 EXPORT_SYMBOL(tcp_add_backlog);
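
/* Illustrative sketch of the backlog limit computed in tcp_add_backlog(),
 * assuming the common defaults of tcp_rmem[1] = 131072 and
 * tcp_wmem[1] = 16384 (values not taken from this file):
 *
 *	limit = 2 * sk_rcvbuf + sk_sndbuf / 2 + 64 KB
 *	      = 262144 + 8192 + 65536
 *	      = 335872 bytes (then clamped to UINT_MAX)
 *
 * so the backlog of a not-yet-processed socket may transiently hold a bit
 * more than twice the receive buffer before sk_add_backlog() starts dropping.
 */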
2139 
2140 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2141 {
2142 	struct tcphdr *th = (struct tcphdr *)skb->data;
2143 
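	/* The trim cap of th->doff * 4 keeps a filter program that returns a
	 * short length from truncating the TCP header itself; only payload
	 * beyond the header can be trimmed away.
	 */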
2144 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
2145 }
2146 EXPORT_SYMBOL(tcp_filter);
2147 
2148 static void tcp_v4_restore_cb(struct sk_buff *skb)
2149 {
2150 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2151 		sizeof(struct inet_skb_parm));
2152 }
2153 
2154 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2155 			   const struct tcphdr *th)
2156 {
2157 	/* This is tricky: we move IPCB into its correct location inside TCP_SKB_CB().
2158 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2159 	 */
2160 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2161 		sizeof(struct inet_skb_parm));
2162 	barrier();
2163 
2164 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2165 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2166 				    skb->len - th->doff * 4);
2167 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2168 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2169 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2170 	TCP_SKB_CB(skb)->sacked	 = 0;
2171 	TCP_SKB_CB(skb)->has_rxtstamp =
2172 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2173 }
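
/* A worked example (illustrative only) of the end_seq arithmetic above:
 * SYN and FIN each consume one unit of sequence space in addition to the
 * payload, so a bare SYN with seq = 1000 yields end_seq = 1001, while a
 * segment with seq = 1000, 500 bytes of payload and FIN set yields
 * end_seq = 1000 + 500 + 1 = 1501.
 */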
2174 
2175 /*
2176  *	From tcp_input.c
2177  */
2178 
2179 int tcp_v4_rcv(struct sk_buff *skb)
2180 {
2181 	struct net *net = dev_net(skb->dev);
2182 	enum skb_drop_reason drop_reason;
2183 	int sdif = inet_sdif(skb);
2184 	int dif = inet_iif(skb);
2185 	const struct iphdr *iph;
2186 	const struct tcphdr *th;
2187 	struct sock *sk = NULL;
2188 	bool refcounted;
2189 	int ret;
2190 	u32 isn;
2191 
2192 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2193 	if (skb->pkt_type != PACKET_HOST)
2194 		goto discard_it;
2195 
2196 	/* Count it even if it's bad */
2197 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2198 
2199 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2200 		goto discard_it;
2201 
2202 	th = (const struct tcphdr *)skb->data;
2203 
2204 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2205 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2206 		goto bad_packet;
2207 	}
2208 	if (!pskb_may_pull(skb, th->doff * 4))
2209 		goto discard_it;
2210 
2211 	/* An explanation is required here, I think.
2212 	 * Packet length and doff are validated by header prediction,
2213 	 * provided the case of th->doff == 0 is eliminated.
2214 	 * So, we defer the checks. */
2215 
2216 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2217 		goto csum_error;
2218 
2219 	th = (const struct tcphdr *)skb->data;
2220 	iph = ip_hdr(skb);
2221 lookup:
2222 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2223 			       skb, __tcp_hdrlen(th), th->source,
2224 			       th->dest, sdif, &refcounted);
2225 	if (!sk)
2226 		goto no_tcp_socket;
2227 
2228 	if (sk->sk_state == TCP_TIME_WAIT)
2229 		goto do_time_wait;
2230 
2231 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2232 		struct request_sock *req = inet_reqsk(sk);
2233 		bool req_stolen = false;
2234 		struct sock *nsk;
2235 
2236 		sk = req->rsk_listener;
2237 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2238 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2239 		else
2240 			drop_reason = tcp_inbound_hash(sk, req, skb,
2241 						       &iph->saddr, &iph->daddr,
2242 						       AF_INET, dif, sdif);
2243 		if (unlikely(drop_reason)) {
2244 			sk_drops_add(sk, skb);
2245 			reqsk_put(req);
2246 			goto discard_it;
2247 		}
2248 		if (tcp_checksum_complete(skb)) {
2249 			reqsk_put(req);
2250 			goto csum_error;
2251 		}
2252 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2253 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2254 			if (!nsk) {
2255 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2256 				goto lookup;
2257 			}
2258 			sk = nsk;
2259 			/* reuseport_migrate_sock() has already taken one sk_refcnt
2260 			 * before returning.
2261 			 */
2262 		} else {
2263 			/* We own a reference on the listener, increase it again
2264 			 * as we might lose it too soon.
2265 			 */
2266 			sock_hold(sk);
2267 		}
2268 		refcounted = true;
2269 		nsk = NULL;
2270 		if (!tcp_filter(sk, skb)) {
2271 			th = (const struct tcphdr *)skb->data;
2272 			iph = ip_hdr(skb);
2273 			tcp_v4_fill_cb(skb, iph, th);
2274 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2275 		} else {
2276 			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2277 		}
2278 		if (!nsk) {
2279 			reqsk_put(req);
2280 			if (req_stolen) {
2281 				/* Another cpu got exclusive access to req
2282 				 * and created a full blown socket.
2283 				 * Try to feed this packet to this socket
2284 				 * instead of discarding it.
2285 				 */
2286 				tcp_v4_restore_cb(skb);
2287 				sock_put(sk);
2288 				goto lookup;
2289 			}
2290 			goto discard_and_relse;
2291 		}
2292 		nf_reset_ct(skb);
2293 		if (nsk == sk) {
2294 			reqsk_put(req);
2295 			tcp_v4_restore_cb(skb);
2296 		} else {
2297 			drop_reason = tcp_child_process(sk, nsk, skb);
2298 			if (drop_reason) {
2299 				enum sk_rst_reason rst_reason;
2300 
2301 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2302 				tcp_v4_send_reset(nsk, skb, rst_reason);
2303 				goto discard_and_relse;
2304 			}
2305 			sock_put(sk);
2306 			return 0;
2307 		}
2308 	}
2309 
2310 process:
2311 	if (static_branch_unlikely(&ip4_min_ttl)) {
2312 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2313 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2314 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2315 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2316 			goto discard_and_relse;
2317 		}
2318 	}
2319 
2320 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2321 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2322 		goto discard_and_relse;
2323 	}
2324 
2325 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2326 				       AF_INET, dif, sdif);
2327 	if (drop_reason)
2328 		goto discard_and_relse;
2329 
2330 	nf_reset_ct(skb);
2331 
2332 	if (tcp_filter(sk, skb)) {
2333 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2334 		goto discard_and_relse;
2335 	}
2336 	th = (const struct tcphdr *)skb->data;
2337 	iph = ip_hdr(skb);
2338 	tcp_v4_fill_cb(skb, iph, th);
2339 
2340 	skb->dev = NULL;
2341 
2342 	if (sk->sk_state == TCP_LISTEN) {
2343 		ret = tcp_v4_do_rcv(sk, skb);
2344 		goto put_and_return;
2345 	}
2346 
2347 	sk_incoming_cpu_update(sk);
2348 
2349 	bh_lock_sock_nested(sk);
2350 	tcp_segs_in(tcp_sk(sk), skb);
2351 	ret = 0;
2352 	if (!sock_owned_by_user(sk)) {
2353 		ret = tcp_v4_do_rcv(sk, skb);
2354 	} else {
2355 		if (tcp_add_backlog(sk, skb, &drop_reason))
2356 			goto discard_and_relse;
2357 	}
2358 	bh_unlock_sock(sk);
2359 
2360 put_and_return:
2361 	if (refcounted)
2362 		sock_put(sk);
2363 
2364 	return ret;
2365 
2366 no_tcp_socket:
2367 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2368 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2369 		goto discard_it;
2370 
2371 	tcp_v4_fill_cb(skb, iph, th);
2372 
2373 	if (tcp_checksum_complete(skb)) {
2374 csum_error:
2375 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2376 		trace_tcp_bad_csum(skb);
2377 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2378 bad_packet:
2379 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2380 	} else {
2381 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2382 	}
2383 
2384 discard_it:
2385 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2386 	/* Discard frame. */
2387 	sk_skb_reason_drop(sk, skb, drop_reason);
2388 	return 0;
2389 
2390 discard_and_relse:
2391 	sk_drops_add(sk, skb);
2392 	if (refcounted)
2393 		sock_put(sk);
2394 	goto discard_it;
2395 
2396 do_time_wait:
2397 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2398 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2399 		inet_twsk_put(inet_twsk(sk));
2400 		goto discard_it;
2401 	}
2402 
2403 	tcp_v4_fill_cb(skb, iph, th);
2404 
2405 	if (tcp_checksum_complete(skb)) {
2406 		inet_twsk_put(inet_twsk(sk));
2407 		goto csum_error;
2408 	}
2409 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2410 	case TCP_TW_SYN: {
2411 		struct sock *sk2 = inet_lookup_listener(net,
2412 							net->ipv4.tcp_death_row.hashinfo,
2413 							skb, __tcp_hdrlen(th),
2414 							iph->saddr, th->source,
2415 							iph->daddr, th->dest,
2416 							inet_iif(skb),
2417 							sdif);
2418 		if (sk2) {
2419 			inet_twsk_deschedule_put(inet_twsk(sk));
2420 			sk = sk2;
2421 			tcp_v4_restore_cb(skb);
2422 			refcounted = false;
2423 			__this_cpu_write(tcp_tw_isn, isn);
2424 			goto process;
2425 		}
2426 	}
2427 		/* to ACK */
2428 		fallthrough;
2429 	case TCP_TW_ACK:
2430 		tcp_v4_timewait_ack(sk, skb);
2431 		break;
2432 	case TCP_TW_RST:
2433 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2434 		inet_twsk_deschedule_put(inet_twsk(sk));
2435 		goto discard_it;
2436 	case TCP_TW_SUCCESS:;
2437 	}
2438 	goto discard_it;
2439 }
2440 
2441 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2442 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2443 	.twsk_destructor= tcp_twsk_destructor,
2444 };
2445 
2446 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2447 {
2448 	struct dst_entry *dst = skb_dst(skb);
2449 
2450 	if (dst && dst_hold_safe(dst)) {
2451 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2452 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2453 	}
2454 }
2455 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2456 
2457 const struct inet_connection_sock_af_ops ipv4_specific = {
2458 	.queue_xmit	   = ip_queue_xmit,
2459 	.send_check	   = tcp_v4_send_check,
2460 	.rebuild_header	   = inet_sk_rebuild_header,
2461 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2462 	.conn_request	   = tcp_v4_conn_request,
2463 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2464 	.net_header_len	   = sizeof(struct iphdr),
2465 	.setsockopt	   = ip_setsockopt,
2466 	.getsockopt	   = ip_getsockopt,
2467 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2468 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2469 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2470 };
2471 EXPORT_SYMBOL(ipv4_specific);
2472 
2473 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2474 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2475 #ifdef CONFIG_TCP_MD5SIG
2476 	.md5_lookup		= tcp_v4_md5_lookup,
2477 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2478 	.md5_parse		= tcp_v4_parse_md5_keys,
2479 #endif
2480 #ifdef CONFIG_TCP_AO
2481 	.ao_lookup		= tcp_v4_ao_lookup,
2482 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2483 	.ao_parse		= tcp_v4_parse_ao,
2484 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2485 #endif
2486 };
2487 #endif
2488 
2489 /* NOTE: A lot of things are set to zero explicitly by the call to
2490  *       sk_alloc(), so they need not be done here.
2491  */
2492 static int tcp_v4_init_sock(struct sock *sk)
2493 {
2494 	struct inet_connection_sock *icsk = inet_csk(sk);
2495 
2496 	tcp_init_sock(sk);
2497 
2498 	icsk->icsk_af_ops = &ipv4_specific;
2499 
2500 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2501 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2502 #endif
2503 
2504 	return 0;
2505 }
2506 
2507 #ifdef CONFIG_TCP_MD5SIG
2508 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2509 {
2510 	struct tcp_md5sig_info *md5sig;
2511 
2512 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2513 	kfree(md5sig);
2514 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2515 	tcp_md5_release_sigpool();
2516 }
2517 #endif
2518 
2519 static void tcp_release_user_frags(struct sock *sk)
2520 {
2521 #ifdef CONFIG_PAGE_POOL
2522 	unsigned long index;
2523 	void *netmem;
2524 
2525 	xa_for_each(&sk->sk_user_frags, index, netmem)
2526 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2527 #endif
2528 }
2529 
2530 void tcp_v4_destroy_sock(struct sock *sk)
2531 {
2532 	struct tcp_sock *tp = tcp_sk(sk);
2533 
2534 	tcp_release_user_frags(sk);
2535 
2536 	xa_destroy(&sk->sk_user_frags);
2537 
2538 	trace_tcp_destroy_sock(sk);
2539 
2540 	tcp_clear_xmit_timers(sk);
2541 
2542 	tcp_cleanup_congestion_control(sk);
2543 
2544 	tcp_cleanup_ulp(sk);
2545 
2546 	/* Clean up the write buffer. */
2547 	tcp_write_queue_purge(sk);
2548 
2549 	/* Check if we want to disable active TFO */
2550 	tcp_fastopen_active_disable_ofo_check(sk);
2551 
2552 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2553 	skb_rbtree_purge(&tp->out_of_order_queue);
2554 
2555 #ifdef CONFIG_TCP_MD5SIG
2556 	/* Clean up the MD5 key list, if any */
2557 	if (tp->md5sig_info) {
2558 		struct tcp_md5sig_info *md5sig;
2559 
2560 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2561 		tcp_clear_md5_list(sk);
2562 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2563 		rcu_assign_pointer(tp->md5sig_info, NULL);
2564 	}
2565 #endif
2566 	tcp_ao_destroy_sock(sk, false);
2567 
2568 	/* Clean up a referenced TCP bind bucket. */
2569 	if (inet_csk(sk)->icsk_bind_hash)
2570 		inet_put_port(sk);
2571 
2572 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2573 
2574 	/* If socket is aborted during connect operation */
2575 	tcp_free_fastopen_req(tp);
2576 	tcp_fastopen_destroy_cipher(sk);
2577 	tcp_saved_syn_free(tp);
2578 
2579 	sk_sockets_allocated_dec(sk);
2580 }
2581 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2582 
2583 #ifdef CONFIG_PROC_FS
2584 /* Proc filesystem TCP sock list dumping. */
2585 
2586 static unsigned short seq_file_family(const struct seq_file *seq);
2587 
2588 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2589 {
2590 	unsigned short family = seq_file_family(seq);
2591 
2592 	/* AF_UNSPEC is used as a match all */
2593 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2594 		net_eq(sock_net(sk), seq_file_net(seq)));
2595 }
2596 
2597 /* Find a non-empty bucket (starting from st->bucket)
2598  * and return the first sk from it.
2599  */
2600 static void *listening_get_first(struct seq_file *seq)
2601 {
2602 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2603 	struct tcp_iter_state *st = seq->private;
2604 
2605 	st->offset = 0;
2606 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2607 		struct inet_listen_hashbucket *ilb2;
2608 		struct hlist_nulls_node *node;
2609 		struct sock *sk;
2610 
2611 		ilb2 = &hinfo->lhash2[st->bucket];
2612 		if (hlist_nulls_empty(&ilb2->nulls_head))
2613 			continue;
2614 
2615 		spin_lock(&ilb2->lock);
2616 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2617 			if (seq_sk_match(seq, sk))
2618 				return sk;
2619 		}
2620 		spin_unlock(&ilb2->lock);
2621 	}
2622 
2623 	return NULL;
2624 }
2625 
2626 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2627  * If "cur" is the last one in the st->bucket,
2628  * call listening_get_first() to return the first sk of the next
2629  * non-empty bucket.
2630  */
2631 static void *listening_get_next(struct seq_file *seq, void *cur)
2632 {
2633 	struct tcp_iter_state *st = seq->private;
2634 	struct inet_listen_hashbucket *ilb2;
2635 	struct hlist_nulls_node *node;
2636 	struct inet_hashinfo *hinfo;
2637 	struct sock *sk = cur;
2638 
2639 	++st->num;
2640 	++st->offset;
2641 
2642 	sk = sk_nulls_next(sk);
2643 	sk_nulls_for_each_from(sk, node) {
2644 		if (seq_sk_match(seq, sk))
2645 			return sk;
2646 	}
2647 
2648 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2649 	ilb2 = &hinfo->lhash2[st->bucket];
2650 	spin_unlock(&ilb2->lock);
2651 	++st->bucket;
2652 	return listening_get_first(seq);
2653 }
2654 
2655 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2656 {
2657 	struct tcp_iter_state *st = seq->private;
2658 	void *rc;
2659 
2660 	st->bucket = 0;
2661 	st->offset = 0;
2662 	rc = listening_get_first(seq);
2663 
2664 	while (rc && *pos) {
2665 		rc = listening_get_next(seq, rc);
2666 		--*pos;
2667 	}
2668 	return rc;
2669 }
2670 
2671 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2672 				const struct tcp_iter_state *st)
2673 {
2674 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2675 }
2676 
2677 /*
2678  * Get first established socket starting from bucket given in st->bucket.
2679  * If st->bucket is zero, the very first socket in the hash is returned.
2680  */
2681 static void *established_get_first(struct seq_file *seq)
2682 {
2683 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2684 	struct tcp_iter_state *st = seq->private;
2685 
2686 	st->offset = 0;
2687 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2688 		struct sock *sk;
2689 		struct hlist_nulls_node *node;
2690 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2691 
2692 		cond_resched();
2693 
2694 		/* Lockless fast path for the common case of empty buckets */
2695 		if (empty_bucket(hinfo, st))
2696 			continue;
2697 
2698 		spin_lock_bh(lock);
2699 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2700 			if (seq_sk_match(seq, sk))
2701 				return sk;
2702 		}
2703 		spin_unlock_bh(lock);
2704 	}
2705 
2706 	return NULL;
2707 }
2708 
2709 static void *established_get_next(struct seq_file *seq, void *cur)
2710 {
2711 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2712 	struct tcp_iter_state *st = seq->private;
2713 	struct hlist_nulls_node *node;
2714 	struct sock *sk = cur;
2715 
2716 	++st->num;
2717 	++st->offset;
2718 
2719 	sk = sk_nulls_next(sk);
2720 
2721 	sk_nulls_for_each_from(sk, node) {
2722 		if (seq_sk_match(seq, sk))
2723 			return sk;
2724 	}
2725 
2726 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2727 	++st->bucket;
2728 	return established_get_first(seq);
2729 }
2730 
2731 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2732 {
2733 	struct tcp_iter_state *st = seq->private;
2734 	void *rc;
2735 
2736 	st->bucket = 0;
2737 	rc = established_get_first(seq);
2738 
2739 	while (rc && pos) {
2740 		rc = established_get_next(seq, rc);
2741 		--pos;
2742 	}
2743 	return rc;
2744 }
2745 
2746 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2747 {
2748 	void *rc;
2749 	struct tcp_iter_state *st = seq->private;
2750 
2751 	st->state = TCP_SEQ_STATE_LISTENING;
2752 	rc	  = listening_get_idx(seq, &pos);
2753 
2754 	if (!rc) {
2755 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2756 		rc	  = established_get_idx(seq, pos);
2757 	}
2758 
2759 	return rc;
2760 }
2761 
2762 static void *tcp_seek_last_pos(struct seq_file *seq)
2763 {
2764 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2765 	struct tcp_iter_state *st = seq->private;
2766 	int bucket = st->bucket;
2767 	int offset = st->offset;
2768 	int orig_num = st->num;
2769 	void *rc = NULL;
2770 
2771 	switch (st->state) {
2772 	case TCP_SEQ_STATE_LISTENING:
2773 		if (st->bucket > hinfo->lhash2_mask)
2774 			break;
2775 		rc = listening_get_first(seq);
2776 		while (offset-- && rc && bucket == st->bucket)
2777 			rc = listening_get_next(seq, rc);
2778 		if (rc)
2779 			break;
2780 		st->bucket = 0;
2781 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2782 		fallthrough;
2783 	case TCP_SEQ_STATE_ESTABLISHED:
2784 		if (st->bucket > hinfo->ehash_mask)
2785 			break;
2786 		rc = established_get_first(seq);
2787 		while (offset-- && rc && bucket == st->bucket)
2788 			rc = established_get_next(seq, rc);
2789 	}
2790 
2791 	st->num = orig_num;
2792 
2793 	return rc;
2794 }
2795 
2796 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2797 {
2798 	struct tcp_iter_state *st = seq->private;
2799 	void *rc;
2800 
2801 	if (*pos && *pos == st->last_pos) {
2802 		rc = tcp_seek_last_pos(seq);
2803 		if (rc)
2804 			goto out;
2805 	}
2806 
2807 	st->state = TCP_SEQ_STATE_LISTENING;
2808 	st->num = 0;
2809 	st->bucket = 0;
2810 	st->offset = 0;
2811 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2812 
2813 out:
2814 	st->last_pos = *pos;
2815 	return rc;
2816 }
2817 EXPORT_SYMBOL(tcp_seq_start);
2818 
2819 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2820 {
2821 	struct tcp_iter_state *st = seq->private;
2822 	void *rc = NULL;
2823 
2824 	if (v == SEQ_START_TOKEN) {
2825 		rc = tcp_get_idx(seq, 0);
2826 		goto out;
2827 	}
2828 
2829 	switch (st->state) {
2830 	case TCP_SEQ_STATE_LISTENING:
2831 		rc = listening_get_next(seq, v);
2832 		if (!rc) {
2833 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2834 			st->bucket = 0;
2835 			st->offset = 0;
2836 			rc	  = established_get_first(seq);
2837 		}
2838 		break;
2839 	case TCP_SEQ_STATE_ESTABLISHED:
2840 		rc = established_get_next(seq, v);
2841 		break;
2842 	}
2843 out:
2844 	++*pos;
2845 	st->last_pos = *pos;
2846 	return rc;
2847 }
2848 EXPORT_SYMBOL(tcp_seq_next);
2849 
2850 void tcp_seq_stop(struct seq_file *seq, void *v)
2851 {
2852 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2853 	struct tcp_iter_state *st = seq->private;
2854 
2855 	switch (st->state) {
2856 	case TCP_SEQ_STATE_LISTENING:
2857 		if (v != SEQ_START_TOKEN)
2858 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2859 		break;
2860 	case TCP_SEQ_STATE_ESTABLISHED:
2861 		if (v)
2862 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2863 		break;
2864 	}
2865 }
2866 EXPORT_SYMBOL(tcp_seq_stop);
2867 
2868 static void get_openreq4(const struct request_sock *req,
2869 			 struct seq_file *f, int i)
2870 {
2871 	const struct inet_request_sock *ireq = inet_rsk(req);
2872 	long delta = req->rsk_timer.expires - jiffies;
2873 
2874 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2875 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2876 		i,
2877 		ireq->ir_loc_addr,
2878 		ireq->ir_num,
2879 		ireq->ir_rmt_addr,
2880 		ntohs(ireq->ir_rmt_port),
2881 		TCP_SYN_RECV,
2882 		0, 0, /* could print option size, but that is af dependent. */
2883 		1,    /* timers active (only the expire timer) */
2884 		jiffies_delta_to_clock_t(delta),
2885 		req->num_timeout,
2886 		from_kuid_munged(seq_user_ns(f),
2887 				 sock_i_uid(req->rsk_listener)),
2888 		0,  /* non standard timer */
2889 		0, /* open_requests have no inode */
2890 		0,
2891 		req);
2892 }
2893 
2894 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2895 {
2896 	int timer_active;
2897 	unsigned long timer_expires;
2898 	const struct tcp_sock *tp = tcp_sk(sk);
2899 	const struct inet_connection_sock *icsk = inet_csk(sk);
2900 	const struct inet_sock *inet = inet_sk(sk);
2901 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2902 	__be32 dest = inet->inet_daddr;
2903 	__be32 src = inet->inet_rcv_saddr;
2904 	__u16 destp = ntohs(inet->inet_dport);
2905 	__u16 srcp = ntohs(inet->inet_sport);
2906 	u8 icsk_pending;
2907 	int rx_queue;
2908 	int state;
2909 
2910 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2911 	if (icsk_pending == ICSK_TIME_RETRANS ||
2912 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2913 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2914 		timer_active	= 1;
2915 		timer_expires	= icsk->icsk_timeout;
2916 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2917 		timer_active	= 4;
2918 		timer_expires	= icsk->icsk_timeout;
2919 	} else if (timer_pending(&sk->sk_timer)) {
2920 		timer_active	= 2;
2921 		timer_expires	= sk->sk_timer.expires;
2922 	} else {
2923 		timer_active	= 0;
2924 		timer_expires = jiffies;
2925 	}
2926 
2927 	state = inet_sk_state_load(sk);
2928 	if (state == TCP_LISTEN)
2929 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2930 	else
2931 		/* Because we don't lock the socket,
2932 		 * we might find a transient negative value.
2933 		 */
2934 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2935 				      READ_ONCE(tp->copied_seq), 0);
2936 
2937 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2938 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2939 		i, src, srcp, dest, destp, state,
2940 		READ_ONCE(tp->write_seq) - tp->snd_una,
2941 		rx_queue,
2942 		timer_active,
2943 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2944 		icsk->icsk_retransmits,
2945 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2946 		icsk->icsk_probes_out,
2947 		sock_i_ino(sk),
2948 		refcount_read(&sk->sk_refcnt), sk,
2949 		jiffies_to_clock_t(icsk->icsk_rto),
2950 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2951 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2952 		tcp_snd_cwnd(tp),
2953 		state == TCP_LISTEN ?
2954 		    fastopenq->max_qlen :
2955 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2956 }
2957 
2958 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2959 			       struct seq_file *f, int i)
2960 {
2961 	long delta = tw->tw_timer.expires - jiffies;
2962 	__be32 dest, src;
2963 	__u16 destp, srcp;
2964 
2965 	dest  = tw->tw_daddr;
2966 	src   = tw->tw_rcv_saddr;
2967 	destp = ntohs(tw->tw_dport);
2968 	srcp  = ntohs(tw->tw_sport);
2969 
2970 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2971 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2972 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2973 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2974 		refcount_read(&tw->tw_refcnt), tw);
2975 }
2976 
2977 #define TMPSZ 150
2978 
2979 static int tcp4_seq_show(struct seq_file *seq, void *v)
2980 {
2981 	struct tcp_iter_state *st;
2982 	struct sock *sk = v;
2983 
2984 	seq_setwidth(seq, TMPSZ - 1);
2985 	if (v == SEQ_START_TOKEN) {
2986 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2987 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2988 			   "inode");
2989 		goto out;
2990 	}
2991 	st = seq->private;
2992 
2993 	if (sk->sk_state == TCP_TIME_WAIT)
2994 		get_timewait4_sock(v, seq, st->num);
2995 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2996 		get_openreq4(v, seq, st->num);
2997 	else
2998 		get_tcp4_sock(v, seq, st->num);
2999 out:
3000 	seq_pad(seq, '\n');
3001 	return 0;
3002 }
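
/* An illustrative (not captured) /proc/net/tcp entry for a listener on
 * port 8080, uid 1000, inode 23456; addresses and ports are hexadecimal,
 * IPv4 addresses appear byte-swapped on little-endian hosts, and the
 * trailing fields come from get_tcp4_sock():
 *
 *   0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000 00000000  1000        0 23456 1 0000000000000000 100 0 0 10 0
 */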
3003 
3004 #ifdef CONFIG_BPF_SYSCALL
3005 struct bpf_tcp_iter_state {
3006 	struct tcp_iter_state state;
3007 	unsigned int cur_sk;
3008 	unsigned int end_sk;
3009 	unsigned int max_sk;
3010 	struct sock **batch;
3011 	bool st_bucket_done;
3012 };
3013 
3014 struct bpf_iter__tcp {
3015 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3016 	__bpf_md_ptr(struct sock_common *, sk_common);
3017 	uid_t uid __aligned(8);
3018 };
3019 
3020 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3021 			     struct sock_common *sk_common, uid_t uid)
3022 {
3023 	struct bpf_iter__tcp ctx;
3024 
3025 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3026 	ctx.meta = meta;
3027 	ctx.sk_common = sk_common;
3028 	ctx.uid = uid;
3029 	return bpf_iter_run_prog(prog, &ctx);
3030 }
3031 
3032 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3033 {
3034 	while (iter->cur_sk < iter->end_sk)
3035 		sock_gen_put(iter->batch[iter->cur_sk++]);
3036 }
3037 
3038 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3039 				      unsigned int new_batch_sz)
3040 {
3041 	struct sock **new_batch;
3042 
3043 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3044 			     GFP_USER | __GFP_NOWARN);
3045 	if (!new_batch)
3046 		return -ENOMEM;
3047 
3048 	bpf_iter_tcp_put_batch(iter);
3049 	kvfree(iter->batch);
3050 	iter->batch = new_batch;
3051 	iter->max_sk = new_batch_sz;
3052 
3053 	return 0;
3054 }
3055 
3056 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3057 						 struct sock *start_sk)
3058 {
3059 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3060 	struct bpf_tcp_iter_state *iter = seq->private;
3061 	struct tcp_iter_state *st = &iter->state;
3062 	struct hlist_nulls_node *node;
3063 	unsigned int expected = 1;
3064 	struct sock *sk;
3065 
3066 	sock_hold(start_sk);
3067 	iter->batch[iter->end_sk++] = start_sk;
3068 
3069 	sk = sk_nulls_next(start_sk);
3070 	sk_nulls_for_each_from(sk, node) {
3071 		if (seq_sk_match(seq, sk)) {
3072 			if (iter->end_sk < iter->max_sk) {
3073 				sock_hold(sk);
3074 				iter->batch[iter->end_sk++] = sk;
3075 			}
3076 			expected++;
3077 		}
3078 	}
3079 	spin_unlock(&hinfo->lhash2[st->bucket].lock);
3080 
3081 	return expected;
3082 }
3083 
3084 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3085 						   struct sock *start_sk)
3086 {
3087 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3088 	struct bpf_tcp_iter_state *iter = seq->private;
3089 	struct tcp_iter_state *st = &iter->state;
3090 	struct hlist_nulls_node *node;
3091 	unsigned int expected = 1;
3092 	struct sock *sk;
3093 
3094 	sock_hold(start_sk);
3095 	iter->batch[iter->end_sk++] = start_sk;
3096 
3097 	sk = sk_nulls_next(start_sk);
3098 	sk_nulls_for_each_from(sk, node) {
3099 		if (seq_sk_match(seq, sk)) {
3100 			if (iter->end_sk < iter->max_sk) {
3101 				sock_hold(sk);
3102 				iter->batch[iter->end_sk++] = sk;
3103 			}
3104 			expected++;
3105 		}
3106 	}
3107 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3108 
3109 	return expected;
3110 }
3111 
3112 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3113 {
3114 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3115 	struct bpf_tcp_iter_state *iter = seq->private;
3116 	struct tcp_iter_state *st = &iter->state;
3117 	unsigned int expected;
3118 	bool resized = false;
3119 	struct sock *sk;
3120 
3121 	/* The st->bucket is done.  Directly advance to the next
3122 	 * bucket instead of having tcp_seek_last_pos() skip entries
3123 	 * one by one in the current bucket, only to find out that it
3124 	 * has to advance to the next bucket.
3125 	 */
3126 	if (iter->st_bucket_done) {
3127 		st->offset = 0;
3128 		st->bucket++;
3129 		if (st->state == TCP_SEQ_STATE_LISTENING &&
3130 		    st->bucket > hinfo->lhash2_mask) {
3131 			st->state = TCP_SEQ_STATE_ESTABLISHED;
3132 			st->bucket = 0;
3133 		}
3134 	}
3135 
3136 again:
3137 	/* Get a new batch */
3138 	iter->cur_sk = 0;
3139 	iter->end_sk = 0;
3140 	iter->st_bucket_done = false;
3141 
3142 	sk = tcp_seek_last_pos(seq);
3143 	if (!sk)
3144 		return NULL; /* Done */
3145 
3146 	if (st->state == TCP_SEQ_STATE_LISTENING)
3147 		expected = bpf_iter_tcp_listening_batch(seq, sk);
3148 	else
3149 		expected = bpf_iter_tcp_established_batch(seq, sk);
3150 
3151 	if (iter->end_sk == expected) {
3152 		iter->st_bucket_done = true;
3153 		return sk;
3154 	}
3155 
3156 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3157 		resized = true;
3158 		goto again;
3159 	}
3160 
3161 	return sk;
3162 }
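
/* Batch-sizing sketch, assuming the INIT_BATCH_SZ of 16 defined later in
 * this file: if a bucket holds 40 matching sockets, the first pass fills
 * 16 slots but counts expected == 40, so the batch is reallocated to
 * 40 * 3 / 2 == 60 entries and the bucket is walked once more.  If even
 * the second pass comes up short (sockets were added meanwhile), the
 * partial batch is returned and st_bucket_done stays false.
 */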
3163 
3164 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3165 {
3166 	/* bpf iter does not support lseek, so it always
3167 	 * continues from where it was stop()-ped.
3168 	 */
3169 	if (*pos)
3170 		return bpf_iter_tcp_batch(seq);
3171 
3172 	return SEQ_START_TOKEN;
3173 }
3174 
3175 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3176 {
3177 	struct bpf_tcp_iter_state *iter = seq->private;
3178 	struct tcp_iter_state *st = &iter->state;
3179 	struct sock *sk;
3180 
3181 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
3182 	 * been shown by seq_show(), so advance to the next sk in
3183 	 * the batch.
3184 	 */
3185 	if (iter->cur_sk < iter->end_sk) {
3186 		/* Keeping st->num consistent in tcp_iter_state.
3187 		 * bpf_iter_tcp does not use st->num.
3188 		 * meta.seq_num is used instead.
3189 		 */
3190 		st->num++;
3191 		/* Move st->offset to the next sk in the bucket such that
3192 		 * the future start() will resume at st->offset in
3193 		 * st->bucket.  See tcp_seek_last_pos().
3194 		 */
3195 		st->offset++;
3196 		sock_gen_put(iter->batch[iter->cur_sk++]);
3197 	}
3198 
3199 	if (iter->cur_sk < iter->end_sk)
3200 		sk = iter->batch[iter->cur_sk];
3201 	else
3202 		sk = bpf_iter_tcp_batch(seq);
3203 
3204 	++*pos;
3205 	/* Keeping st->last_pos consistent in tcp_iter_state.
3206 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3207 	 */
3208 	st->last_pos = *pos;
3209 	return sk;
3210 }
3211 
3212 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3213 {
3214 	struct bpf_iter_meta meta;
3215 	struct bpf_prog *prog;
3216 	struct sock *sk = v;
3217 	uid_t uid;
3218 	int ret;
3219 
3220 	if (v == SEQ_START_TOKEN)
3221 		return 0;
3222 
3223 	if (sk_fullsock(sk))
3224 		lock_sock(sk);
3225 
3226 	if (unlikely(sk_unhashed(sk))) {
3227 		ret = SEQ_SKIP;
3228 		goto unlock;
3229 	}
3230 
3231 	if (sk->sk_state == TCP_TIME_WAIT) {
3232 		uid = 0;
3233 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3234 		const struct request_sock *req = v;
3235 
3236 		uid = from_kuid_munged(seq_user_ns(seq),
3237 				       sock_i_uid(req->rsk_listener));
3238 	} else {
3239 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3240 	}
3241 
3242 	meta.seq = seq;
3243 	prog = bpf_iter_get_info(&meta, false);
3244 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3245 
3246 unlock:
3247 	if (sk_fullsock(sk))
3248 		release_sock(sk);
3249 	return ret;
3250 
3251 }
3252 
3253 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3254 {
3255 	struct bpf_tcp_iter_state *iter = seq->private;
3256 	struct bpf_iter_meta meta;
3257 	struct bpf_prog *prog;
3258 
3259 	if (!v) {
3260 		meta.seq = seq;
3261 		prog = bpf_iter_get_info(&meta, true);
3262 		if (prog)
3263 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3264 	}
3265 
3266 	if (iter->cur_sk < iter->end_sk) {
3267 		bpf_iter_tcp_put_batch(iter);
3268 		iter->st_bucket_done = false;
3269 	}
3270 }
3271 
3272 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3273 	.show		= bpf_iter_tcp_seq_show,
3274 	.start		= bpf_iter_tcp_seq_start,
3275 	.next		= bpf_iter_tcp_seq_next,
3276 	.stop		= bpf_iter_tcp_seq_stop,
3277 };
3278 #endif
3279 static unsigned short seq_file_family(const struct seq_file *seq)
3280 {
3281 	const struct tcp_seq_afinfo *afinfo;
3282 
3283 #ifdef CONFIG_BPF_SYSCALL
3284 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3285 	if (seq->op == &bpf_iter_tcp_seq_ops)
3286 		return AF_UNSPEC;
3287 #endif
3288 
3289 	/* Iterated from proc fs */
3290 	afinfo = pde_data(file_inode(seq->file));
3291 	return afinfo->family;
3292 }
3293 
3294 static const struct seq_operations tcp4_seq_ops = {
3295 	.show		= tcp4_seq_show,
3296 	.start		= tcp_seq_start,
3297 	.next		= tcp_seq_next,
3298 	.stop		= tcp_seq_stop,
3299 };
3300 
3301 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3302 	.family		= AF_INET,
3303 };
3304 
3305 static int __net_init tcp4_proc_init_net(struct net *net)
3306 {
3307 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3308 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3309 		return -ENOMEM;
3310 	return 0;
3311 }
3312 
3313 static void __net_exit tcp4_proc_exit_net(struct net *net)
3314 {
3315 	remove_proc_entry("tcp", net->proc_net);
3316 }
3317 
3318 static struct pernet_operations tcp4_net_ops = {
3319 	.init = tcp4_proc_init_net,
3320 	.exit = tcp4_proc_exit_net,
3321 };
3322 
3323 int __init tcp4_proc_init(void)
3324 {
3325 	return register_pernet_subsys(&tcp4_net_ops);
3326 }
3327 
3328 void tcp4_proc_exit(void)
3329 {
3330 	unregister_pernet_subsys(&tcp4_net_ops);
3331 }
3332 #endif /* CONFIG_PROC_FS */
3333 
3334 /* @wake is one when sk_stream_write_space() calls us.
3335  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3336  * This mimics the strategy used in sock_def_write_space().
3337  */
3338 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3339 {
3340 	const struct tcp_sock *tp = tcp_sk(sk);
3341 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3342 			    READ_ONCE(tp->snd_nxt);
3343 
3344 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3345 }
3346 EXPORT_SYMBOL(tcp_stream_memory_free);
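
/* Worked example (illustrative): with TCP_NOTSENT_LOWAT set to 131072,
 * the plain check (wake == 0) passes while fewer than 128 KB of
 * not-yet-sent bytes are queued, whereas the wake-up path (wake == 1)
 * doubles notsent_bytes before the comparison and therefore signals
 * EPOLLOUT only once the unsent backlog drops below 64 KB.
 */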
3347 
3348 struct proto tcp_prot = {
3349 	.name			= "TCP",
3350 	.owner			= THIS_MODULE,
3351 	.close			= tcp_close,
3352 	.pre_connect		= tcp_v4_pre_connect,
3353 	.connect		= tcp_v4_connect,
3354 	.disconnect		= tcp_disconnect,
3355 	.accept			= inet_csk_accept,
3356 	.ioctl			= tcp_ioctl,
3357 	.init			= tcp_v4_init_sock,
3358 	.destroy		= tcp_v4_destroy_sock,
3359 	.shutdown		= tcp_shutdown,
3360 	.setsockopt		= tcp_setsockopt,
3361 	.getsockopt		= tcp_getsockopt,
3362 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3363 	.keepalive		= tcp_set_keepalive,
3364 	.recvmsg		= tcp_recvmsg,
3365 	.sendmsg		= tcp_sendmsg,
3366 	.splice_eof		= tcp_splice_eof,
3367 	.backlog_rcv		= tcp_v4_do_rcv,
3368 	.release_cb		= tcp_release_cb,
3369 	.hash			= inet_hash,
3370 	.unhash			= inet_unhash,
3371 	.get_port		= inet_csk_get_port,
3372 	.put_port		= inet_put_port,
3373 #ifdef CONFIG_BPF_SYSCALL
3374 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3375 #endif
3376 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3377 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3378 	.stream_memory_free	= tcp_stream_memory_free,
3379 	.sockets_allocated	= &tcp_sockets_allocated,
3380 	.orphan_count		= &tcp_orphan_count,
3381 
3382 	.memory_allocated	= &tcp_memory_allocated,
3383 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3384 
3385 	.memory_pressure	= &tcp_memory_pressure,
3386 	.sysctl_mem		= sysctl_tcp_mem,
3387 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3388 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3389 	.max_header		= MAX_TCP_HEADER,
3390 	.obj_size		= sizeof(struct tcp_sock),
3391 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3392 	.twsk_prot		= &tcp_timewait_sock_ops,
3393 	.rsk_prot		= &tcp_request_sock_ops,
3394 	.h.hashinfo		= NULL,
3395 	.no_autobind		= true,
3396 	.diag_destroy		= tcp_abort,
3397 };
3398 EXPORT_SYMBOL(tcp_prot);
3399 
3400 static void __net_exit tcp_sk_exit(struct net *net)
3401 {
3402 	if (net->ipv4.tcp_congestion_control)
3403 		bpf_module_put(net->ipv4.tcp_congestion_control,
3404 			       net->ipv4.tcp_congestion_control->owner);
3405 }
3406 
3407 static void __net_init tcp_set_hashinfo(struct net *net)
3408 {
3409 	struct inet_hashinfo *hinfo;
3410 	unsigned int ehash_entries;
3411 	struct net *old_net;
3412 
3413 	if (net_eq(net, &init_net))
3414 		goto fallback;
3415 
3416 	old_net = current->nsproxy->net_ns;
3417 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3418 	if (!ehash_entries)
3419 		goto fallback;
3420 
3421 	ehash_entries = roundup_pow_of_two(ehash_entries);
3422 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3423 	if (!hinfo) {
3424 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3425 			"for a netns, fallback to the global one\n",
3426 			ehash_entries);
3427 fallback:
3428 		hinfo = &tcp_hashinfo;
3429 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3430 	}
3431 
3432 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3433 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3434 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3435 }
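
/* Sizing sketch (values are examples, not defaults): if the parent netns
 * sets net.ipv4.tcp_child_ehash_entries = 1000, the child ehash is rounded
 * up to 1024 buckets, sysctl_max_tw_buckets becomes 1024 / 2 = 512 and
 * sysctl_max_syn_backlog becomes max(128, 1024 / 128) = 128.
 */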
3436 
3437 static int __net_init tcp_sk_init(struct net *net)
3438 {
3439 	net->ipv4.sysctl_tcp_ecn = 2;
3440 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3441 
3442 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3443 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3444 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3445 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3446 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3447 
3448 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3449 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3450 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3451 
3452 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3453 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3454 	net->ipv4.sysctl_tcp_syncookies = 1;
3455 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3456 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3457 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3458 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3459 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3460 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3461 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3462 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3463 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3464 
3465 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3466 	tcp_set_hashinfo(net);
3467 
3468 	net->ipv4.sysctl_tcp_sack = 1;
3469 	net->ipv4.sysctl_tcp_window_scaling = 1;
3470 	net->ipv4.sysctl_tcp_timestamps = 1;
3471 	net->ipv4.sysctl_tcp_early_retrans = 3;
3472 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3473 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3474 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3475 	net->ipv4.sysctl_tcp_max_reordering = 300;
3476 	net->ipv4.sysctl_tcp_dsack = 1;
3477 	net->ipv4.sysctl_tcp_app_win = 31;
3478 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3479 	net->ipv4.sysctl_tcp_frto = 2;
3480 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3481 	/* This limits the percentage of the congestion window which we
3482 	 * will allow a single TSO frame to consume.  Building TSO frames
3483 	 * which are too large can cause TCP streams to be bursty.
3484 	 */
3485 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3486 	/* Default TSQ limit of 16 TSO segments */
3487 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3488 
3489 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3490 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3491 
3492 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3493 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3494 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3495 	net->ipv4.sysctl_tcp_autocorking = 1;
3496 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3497 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3498 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3499 	if (net != &init_net) {
3500 		memcpy(net->ipv4.sysctl_tcp_rmem,
3501 		       init_net.ipv4.sysctl_tcp_rmem,
3502 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3503 		memcpy(net->ipv4.sysctl_tcp_wmem,
3504 		       init_net.ipv4.sysctl_tcp_wmem,
3505 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3506 	}
3507 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3508 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3509 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3510 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3511 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3512 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3513 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3514 
3515 	/* Set default values for PLB */
3516 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3517 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3518 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3519 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3520 	/* Default congestion threshold for PLB to mark a round is 50% */
3521 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3522 
3523 	/* Reno is always built in */
3524 	if (!net_eq(net, &init_net) &&
3525 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3526 			       init_net.ipv4.tcp_congestion_control->owner))
3527 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3528 	else
3529 		net->ipv4.tcp_congestion_control = &tcp_reno;
3530 
3531 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3532 	net->ipv4.sysctl_tcp_shrink_window = 0;
3533 
3534 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3535 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3536 
3537 	return 0;
3538 }
3539 
3540 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3541 {
3542 	struct net *net;
3543 
3544 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3545 	 * and from the failed setup_net() error unwinding path are serialized.
3546 	 *
3547 	 * Since tcp_twsk_purge() handles twsk in any dead netns, not just those
3548 	 * on net_exit_list, the thread that dismantles a particular twsk must
3549 	 * do so without another thread progressing to refcount_dec_and_test()
3550 	 * of tcp_death_row.tw_refcount.
3551 	 */
3552 	mutex_lock(&tcp_exit_batch_mutex);
3553 
3554 	tcp_twsk_purge(net_exit_list);
3555 
3556 	list_for_each_entry(net, net_exit_list, exit_list) {
3557 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3558 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3559 		tcp_fastopen_ctx_destroy(net);
3560 	}
3561 
3562 	mutex_unlock(&tcp_exit_batch_mutex);
3563 }
3564 
3565 static struct pernet_operations __net_initdata tcp_sk_ops = {
3566 	.init		= tcp_sk_init,
3567 	.exit		= tcp_sk_exit,
3568 	.exit_batch	= tcp_sk_exit_batch,
3569 };
3570 
3571 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3572 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3573 		     struct sock_common *sk_common, uid_t uid)
3574 
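/* The iterator's socket batch starts with room for 16 entries and is
 * presumably reallocated to a larger size whenever a hash bucket holds
 * more sockets than currently fit.
 */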
3575 #define INIT_BATCH_SZ 16
3576 
3577 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3578 {
3579 	struct bpf_tcp_iter_state *iter = priv_data;
3580 	int err;
3581 
3582 	err = bpf_iter_init_seq_net(priv_data, aux);
3583 	if (err)
3584 		return err;
3585 
3586 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3587 	if (err) {
3588 		bpf_iter_fini_seq_net(priv_data);
3589 		return err;
3590 	}
3591 
3592 	return 0;
3593 }
3594 
3595 static void bpf_iter_fini_tcp(void *priv_data)
3596 {
3597 	struct bpf_tcp_iter_state *iter = priv_data;
3598 
3599 	bpf_iter_fini_seq_net(priv_data);
3600 	kvfree(iter->batch);
3601 }
3602 
3603 static const struct bpf_iter_seq_info tcp_seq_info = {
3604 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3605 	.init_seq_private	= bpf_iter_init_tcp,
3606 	.fini_seq_private	= bpf_iter_fini_tcp,
3607 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3608 };
3609 
3610 static const struct bpf_func_proto *
3611 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3612 			    const struct bpf_prog *prog)
3613 {
3614 	switch (func_id) {
3615 	case BPF_FUNC_setsockopt:
3616 		return &bpf_sk_setsockopt_proto;
3617 	case BPF_FUNC_getsockopt:
3618 		return &bpf_sk_getsockopt_proto;
3619 	default:
3620 		return NULL;
3621 	}
3622 }
3623 
3624 static struct bpf_iter_reg tcp_reg_info = {
3625 	.target			= "tcp",
3626 	.ctx_arg_info_size	= 1,
3627 	.ctx_arg_info		= {
3628 		{ offsetof(struct bpf_iter__tcp, sk_common),
3629 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3630 	},
3631 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3632 	.seq_info		= &tcp_seq_info,
3633 };
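/* The sk_common context argument is PTR_TO_BTF_ID_OR_NULL, so iterator
 * programs are expected to NULL-check it before dereferencing.
 */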
3634 
3635 static void __init bpf_iter_register(void)
3636 {
3637 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3638 	if (bpf_iter_reg_target(&tcp_reg_info))
3639 		pr_warn("Warning: could not register bpf iterator tcp\n");
3640 		pr_warn("Could not register bpf iterator tcp\n");
3641 
3642 #endif
3643 
3644 void __init tcp_v4_init(void)
3645 {
3646 	int cpu, res;
3647 
3648 	for_each_possible_cpu(cpu) {
3649 		struct sock *sk;
3650 
3651 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3652 					   IPPROTO_TCP, &init_net);
3653 		if (res)
3654 			panic("Failed to create the TCP control socket.\n");
3655 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3656 
3657 		/* Enforce IP_DF and IPID==0 for RSTs and
3658 		 * ACKs sent in SYN-RECV and TIME-WAIT state.
3659 		 */
3660 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3661 
3662 		sk->sk_clockid = CLOCK_MONOTONIC;
3663 
3664 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3665 	}
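	/* The per-cpu control sockets created above are used by helpers such
	 * as tcp_v4_send_reset() when transmitting stack-generated packets,
	 * e.g. RSTs and ACKs for SYN-RECV and TIME-WAIT sockets.
	 */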
3666 	if (register_pernet_subsys(&tcp_sk_ops))
3667 		panic("Failed to register the TCP pernet operations.\n");
3668 
3669 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3670 	bpf_iter_register();
3671 #endif
3672 }
3673