xref: /linux/net/ipv4/tcp_ipv4.c (revision 8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 #include <linux/sock_diag.h>
62 
63 #include <net/aligned_data.h>
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/inet_ecn.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/secure_seq.h>
75 #include <net/busy_poll.h>
76 #include <net/rstreason.h>
77 
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 #include <linux/btf_ids.h>
85 #include <linux/skbuff_ref.h>
86 
87 #include <crypto/hash.h>
88 #include <linux/scatterlist.h>
89 
90 #include <trace/events/tcp.h>
91 
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96 
97 struct inet_hashinfo tcp_hashinfo;
98 
99 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
100 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
101 };
102 
103 static DEFINE_MUTEX(tcp_exit_batch_mutex);
104 
105 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
106 {
107 	return secure_tcp_seq(ip_hdr(skb)->daddr,
108 			      ip_hdr(skb)->saddr,
109 			      tcp_hdr(skb)->dest,
110 			      tcp_hdr(skb)->source);
111 }
112 
113 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
114 {
115 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
116 }
117 
118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 	struct tcp_sock *tp = tcp_sk(sk);
124 	int ts_recent_stamp;
125 	u32 reuse_thresh;
126 
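	/* A timewait sock whose substate is still FIN-WAIT-2 belongs to a
	 * connection the peer has not yet closed, so never reuse its port
	 * pair.
	 */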
127 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 		reuse = 0;
129 
130 	if (reuse == 2) {
131 		/* Still does not detect *everything* that goes through
132 		 * lo, since we require a loopback src or dst address
133 		 * or direct binding to 'lo' interface.
134 		 */
135 		bool loopback = false;
136 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 			loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 		if (tw->tw_family == AF_INET6) {
140 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 				loopback = true;
145 		} else
146 #endif
147 		{
148 			if (ipv4_is_loopback(tw->tw_daddr) ||
149 			    ipv4_is_loopback(tw->tw_rcv_saddr))
150 				loopback = true;
151 		}
152 		if (!loopback)
153 			reuse = 0;
154 	}
155 
156 	/* With PAWS, it is safe from the viewpoint
157 	   of data integrity. Even without PAWS it is safe provided sequence
158 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159 
160 	   Actually, the idea is close to VJ's, only the timestamp cache is
161 	   held not per host but per port pair, and the TW bucket is used as
162 	   the state holder.
163 
164 	   If TW bucket has been already destroyed we fall back to VJ's scheme
165 	   and use initial timestamp retrieved from peer table.
166 	 */
167 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 	if (ts_recent_stamp &&
171 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 		 * and releasing the bucket lock.
174 		 */
175 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 			return 0;
177 
178 		/* In case of repair and re-using TIME-WAIT sockets we still
179 		 * want to be sure that it is safe as above but honor the
180 		 * sequence numbers and time stamps set as part of the repair
181 		 * process.
182 		 *
183 		 * Without this check re-using a TIME-WAIT socket with TCP
184 		 * repair would accumulate a -1 on the repair assigned
185 		 * sequence number. The first time it is reused the sequence
186 		 * is -1, the second time -2, etc. This fixes that issue
187 		 * without appearing to create any others.
188 		 */
189 		if (likely(!tp->repair)) {
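			/* Advance past the old connection's snd_nxt by more
			 * than the maximum unscaled window (65535), so the
			 * sequence spaces of the two incarnations do not
			 * overlap.  Zero is avoided because tcp_v4_connect()
			 * treats a zero write_seq as "pick a fresh ISN".
			 */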
190 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191 
192 			if (!seq)
193 				seq = 1;
194 			WRITE_ONCE(tp->write_seq, seq);
195 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
196 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 		}
198 
199 		return 1;
200 	}
201 
202 	return 0;
203 }
204 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
205 
206 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
207 			      int addr_len)
208 {
209 	/* This check is replicated from tcp_v4_connect() and intended to
210 	 * prevent BPF program called below from accessing bytes that are out
211 	 * of the bound specified by user in addr_len.
212 	 */
213 	if (addr_len < sizeof(struct sockaddr_in))
214 		return -EINVAL;
215 
216 	sock_owned_by_me(sk);
217 
218 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
219 }
220 
221 /* This will initiate an outgoing connection. */
222 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
223 {
224 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
225 	struct inet_timewait_death_row *tcp_death_row;
226 	struct inet_sock *inet = inet_sk(sk);
227 	struct tcp_sock *tp = tcp_sk(sk);
228 	struct ip_options_rcu *inet_opt;
229 	struct net *net = sock_net(sk);
230 	__be16 orig_sport, orig_dport;
231 	__be32 daddr, nexthop;
232 	struct flowi4 *fl4;
233 	struct rtable *rt;
234 	int err;
235 
236 	if (addr_len < sizeof(struct sockaddr_in))
237 		return -EINVAL;
238 
239 	if (usin->sin_family != AF_INET)
240 		return -EAFNOSUPPORT;
241 
242 	nexthop = daddr = usin->sin_addr.s_addr;
243 	inet_opt = rcu_dereference_protected(inet->inet_opt,
244 					     lockdep_sock_is_held(sk));
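	/* With an IP source route, route towards the first hop (opt.faddr)
	 * while daddr stays the final destination; without SRR, daddr is
	 * refreshed from the route lookup result further below.
	 */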
245 	if (inet_opt && inet_opt->opt.srr) {
246 		if (!daddr)
247 			return -EINVAL;
248 		nexthop = inet_opt->opt.faddr;
249 	}
250 
251 	orig_sport = inet->inet_sport;
252 	orig_dport = usin->sin_port;
253 	fl4 = &inet->cork.fl.u.ip4;
254 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
255 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
256 			      orig_dport, sk);
257 	if (IS_ERR(rt)) {
258 		err = PTR_ERR(rt);
259 		if (err == -ENETUNREACH)
260 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
261 		return err;
262 	}
263 
264 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
265 		ip_rt_put(rt);
266 		return -ENETUNREACH;
267 	}
268 
269 	if (!inet_opt || !inet_opt->opt.srr)
270 		daddr = fl4->daddr;
271 
272 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
273 
274 	if (!inet->inet_saddr) {
275 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
276 		if (err) {
277 			ip_rt_put(rt);
278 			return err;
279 		}
280 	} else {
281 		sk_rcv_saddr_set(sk, inet->inet_saddr);
282 	}
283 
284 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
285 		/* Reset inherited state */
286 		tp->rx_opt.ts_recent	   = 0;
287 		tp->rx_opt.ts_recent_stamp = 0;
288 		if (likely(!tp->repair))
289 			WRITE_ONCE(tp->write_seq, 0);
290 	}
291 
292 	inet->inet_dport = usin->sin_port;
293 	sk_daddr_set(sk, daddr);
294 
295 	inet_csk(sk)->icsk_ext_hdr_len = 0;
296 	if (inet_opt)
297 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
298 
299 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
300 
301 	/* Socket identity is still unknown (sport may be zero).
302 	 * However we set the state to SYN-SENT and, without releasing the
303 	 * socket lock, select a source port, enter ourselves into the hash
304 	 * tables and complete initialization after this.
305 	 */
306 	tcp_set_state(sk, TCP_SYN_SENT);
307 	err = inet_hash_connect(tcp_death_row, sk);
308 	if (err)
309 		goto failure;
310 
311 	sk_set_txhash(sk);
312 
313 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
314 			       inet->inet_sport, inet->inet_dport, sk);
315 	if (IS_ERR(rt)) {
316 		err = PTR_ERR(rt);
317 		rt = NULL;
318 		goto failure;
319 	}
320 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
321 	/* OK, now commit destination to socket.  */
322 	sk->sk_gso_type = SKB_GSO_TCPV4;
323 	sk_setup_caps(sk, &rt->dst);
324 	rt = NULL;
325 
326 	if (likely(!tp->repair)) {
327 		if (!tp->write_seq)
328 			WRITE_ONCE(tp->write_seq,
329 				   secure_tcp_seq(inet->inet_saddr,
330 						  inet->inet_daddr,
331 						  inet->inet_sport,
332 						  usin->sin_port));
333 		WRITE_ONCE(tp->tsoffset,
334 			   secure_tcp_ts_off(net, inet->inet_saddr,
335 					     inet->inet_daddr));
336 	}
337 
338 	atomic_set(&inet->inet_id, get_random_u16());
339 
340 	if (tcp_fastopen_defer_connect(sk, &err))
341 		return err;
342 	if (err)
343 		goto failure;
344 
345 	err = tcp_connect(sk);
346 
347 	if (err)
348 		goto failure;
349 
350 	return 0;
351 
352 failure:
353 	/*
354 	 * This unhashes the socket and releases the local port,
355 	 * if necessary.
356 	 */
357 	tcp_set_state(sk, TCP_CLOSE);
358 	inet_bhash2_reset_saddr(sk);
359 	ip_rt_put(rt);
360 	sk->sk_route_caps = 0;
361 	inet->inet_dport = 0;
362 	return err;
363 }
364 EXPORT_IPV6_MOD(tcp_v4_connect);
365 
366 /*
367  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
368  * It can be called through tcp_release_cb() if socket was owned by user
369  * at the time tcp_v4_err() was called to handle ICMP message.
370  */
371 void tcp_v4_mtu_reduced(struct sock *sk)
372 {
373 	struct inet_sock *inet = inet_sk(sk);
374 	struct dst_entry *dst;
375 	u32 mtu;
376 
377 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
378 		return;
379 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
380 	dst = inet_csk_update_pmtu(sk, mtu);
381 	if (!dst)
382 		return;
383 
384 	/* Something is about to go wrong... Remember the soft error
385 	 * in case this connection is not able to recover.
386 	 */
387 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
388 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
389 
390 	mtu = dst_mtu(dst);
391 
392 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
393 	    ip_sk_accept_pmtu(sk) &&
394 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
395 		tcp_sync_mss(sk, mtu);
396 
397 		/* Resend the TCP packet because it's
398 		 * clear that the old packet has been
399 		 * dropped. This is the new "fast" path mtu
400 		 * discovery.
401 		 */
402 		tcp_simple_retransmit(sk);
403 	} /* else let the usual retransmit timer handle it */
404 }
405 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
406 
407 static void do_redirect(struct sk_buff *skb, struct sock *sk)
408 {
409 	struct dst_entry *dst = __sk_dst_check(sk, 0);
410 
411 	if (dst)
412 		dst->ops->redirect(dst, sk, skb);
413 }
414 
415 
416 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
417 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
418 {
419 	struct request_sock *req = inet_reqsk(sk);
420 	struct net *net = sock_net(sk);
421 
422 	/* ICMPs are not backlogged, hence we cannot get
423 	 * an established socket here.
424 	 */
425 	if (seq != tcp_rsk(req)->snt_isn) {
426 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
427 	} else if (abort) {
428 		/*
429 		 * Still in SYN_RECV, just remove it silently.
430 		 * There is no good way to pass the error to the newly
431 		 * created socket, and POSIX does not want network
432 		 * errors returned from accept().
433 		 */
434 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
435 		tcp_listendrop(req->rsk_listener);
436 	}
437 	reqsk_put(req);
438 }
439 EXPORT_IPV6_MOD(tcp_req_err);
440 
441 /* TCP-LD (RFC 6069) logic */
442 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
443 {
444 	struct inet_connection_sock *icsk = inet_csk(sk);
445 	struct tcp_sock *tp = tcp_sk(sk);
446 	struct sk_buff *skb;
447 	s32 remaining;
448 	u32 delta_us;
449 
450 	if (sock_owned_by_user(sk))
451 		return;
452 
453 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
454 	    !icsk->icsk_backoff)
455 		return;
456 
457 	skb = tcp_rtx_queue_head(sk);
458 	if (WARN_ON_ONCE(!skb))
459 		return;
460 
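	/* RFC 6069: the ICMP error suggests the earlier losses were caused
	 * by a routing problem rather than congestion, so undo one backoff
	 * step and re-arm the retransmit timer with whatever time is left of
	 * the reduced RTO (or retransmit at once if it has already expired).
	 */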
461 	icsk->icsk_backoff--;
462 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
463 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
464 
465 	tcp_mstamp_refresh(tp);
466 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
467 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
468 
469 	if (remaining > 0) {
470 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
471 	} else {
472 		/* RTO revert clocked out retransmission.
473 		 * Will retransmit now.
474 		 */
475 		tcp_retransmit_timer(sk);
476 	}
477 }
478 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
479 
480 /*
481  * This routine is called by the ICMP module when it gets some
482  * sort of error condition.  If err < 0 then the socket should
483  * be closed and the error returned to the user.  If err > 0
484  * it's just the icmp type << 8 | icmp code.  After adjustment
485  * header points to the first 8 bytes of the tcp header.  We need
486  * to find the appropriate port.
487  *
488  * The locking strategy used here is very "optimistic". When
489  * someone else accesses the socket the ICMP is just dropped
490  * and for some paths there is no check at all.
491  * A more general error queue to queue errors for later handling
492  * is probably better.
493  *
494  */
495 
496 int tcp_v4_err(struct sk_buff *skb, u32 info)
497 {
498 	const struct iphdr *iph = (const struct iphdr *)skb->data;
499 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
500 	struct net *net = dev_net_rcu(skb->dev);
501 	const int type = icmp_hdr(skb)->type;
502 	const int code = icmp_hdr(skb)->code;
503 	struct request_sock *fastopen;
504 	struct tcp_sock *tp;
505 	u32 seq, snd_una;
506 	struct sock *sk;
507 	int err;
508 
509 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
510 				       iph->daddr, th->dest, iph->saddr,
511 				       ntohs(th->source), inet_iif(skb), 0);
512 	if (!sk) {
513 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
514 		return -ENOENT;
515 	}
516 	if (sk->sk_state == TCP_TIME_WAIT) {
517 		/* To increase the counter of ignored icmps for TCP-AO */
518 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
519 		inet_twsk_put(inet_twsk(sk));
520 		return 0;
521 	}
522 	seq = ntohl(th->seq);
523 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
524 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
525 				     type == ICMP_TIME_EXCEEDED ||
526 				     (type == ICMP_DEST_UNREACH &&
527 				      (code == ICMP_NET_UNREACH ||
528 				       code == ICMP_HOST_UNREACH)));
529 		return 0;
530 	}
531 
532 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
533 		sock_put(sk);
534 		return 0;
535 	}
536 
537 	bh_lock_sock(sk);
538 	/* If too many ICMPs get dropped on busy
539 	 * servers this needs to be solved differently.
540 	 * We do take care of the PMTU discovery (RFC 1191) special case:
541 	 * we can receive locally generated ICMP messages while the socket is held.
542 	 */
543 	if (sock_owned_by_user(sk)) {
544 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
545 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
546 	}
547 	if (sk->sk_state == TCP_CLOSE)
548 		goto out;
549 
550 	if (static_branch_unlikely(&ip4_min_ttl)) {
551 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
552 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
553 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
554 			goto out;
555 		}
556 	}
557 
558 	tp = tcp_sk(sk);
559 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
560 	fastopen = rcu_dereference(tp->fastopen_rsk);
561 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
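	/* While a passive Fast Open request is still attached, use its ISN
	 * rather than tp->snd_una as the lower bound of the in-window check
	 * below (see the XXX note above).
	 */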
562 	if (sk->sk_state != TCP_LISTEN &&
563 	    !between(seq, snd_una, tp->snd_nxt)) {
564 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
565 		goto out;
566 	}
567 
568 	switch (type) {
569 	case ICMP_REDIRECT:
570 		if (!sock_owned_by_user(sk))
571 			do_redirect(skb, sk);
572 		goto out;
573 	case ICMP_SOURCE_QUENCH:
574 		/* Just silently ignore these. */
575 		goto out;
576 	case ICMP_PARAMETERPROB:
577 		err = EPROTO;
578 		break;
579 	case ICMP_DEST_UNREACH:
580 		if (code > NR_ICMP_UNREACH)
581 			goto out;
582 
583 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
584 			/* We are not interested in TCP_LISTEN and open_requests
585 			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
586 			 * they should go through unfragmented).
587 			 */
588 			if (sk->sk_state == TCP_LISTEN)
589 				goto out;
590 
591 			WRITE_ONCE(tp->mtu_info, info);
592 			if (!sock_owned_by_user(sk)) {
593 				tcp_v4_mtu_reduced(sk);
594 			} else {
595 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
596 					sock_hold(sk);
597 			}
598 			goto out;
599 		}
600 
601 		err = icmp_err_convert[code].errno;
602 		/* check if this ICMP message allows revert of backoff.
603 		 * (see RFC 6069)
604 		 */
605 		if (!fastopen &&
606 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
607 			tcp_ld_RTO_revert(sk, seq);
608 		break;
609 	case ICMP_TIME_EXCEEDED:
610 		err = EHOSTUNREACH;
611 		break;
612 	default:
613 		goto out;
614 	}
615 
616 	switch (sk->sk_state) {
617 	case TCP_SYN_SENT:
618 	case TCP_SYN_RECV:
619 		/* Only in fast or simultaneous open. If a fast open socket is
620 		 * already accepted it is treated as a connected one below.
621 		 */
622 		if (fastopen && !fastopen->sk)
623 			break;
624 
625 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
626 
627 		if (!sock_owned_by_user(sk))
628 			tcp_done_with_error(sk, err);
629 		else
630 			WRITE_ONCE(sk->sk_err_soft, err);
631 		goto out;
632 	}
633 
634 	/* If we've already connected we will keep trying
635 	 * until we time out, or the user gives up.
636 	 *
637 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
638 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
639 	 * but it is obsoleted by PMTU discovery).
640 	 *
641 	 * Note that in the modern internet, where routing is unreliable
642 	 * and broken firewalls sit in every dark corner, sending random
643 	 * errors ordered by their masters, even these two messages finally
644 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
645 	 *
646 	 * Now we are in compliance with RFCs.
647 	 *							--ANK (980905)
648 	 */
649 
650 	if (!sock_owned_by_user(sk) &&
651 	    inet_test_bit(RECVERR, sk)) {
652 		WRITE_ONCE(sk->sk_err, err);
653 		sk_error_report(sk);
654 	} else	{ /* Only an error on timeout */
655 		WRITE_ONCE(sk->sk_err_soft, err);
656 	}
657 
658 out:
659 	bh_unlock_sock(sk);
660 	sock_put(sk);
661 	return 0;
662 }
663 
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 	struct tcphdr *th = tcp_hdr(skb);
667 
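	/* Seed th->check with the inverted pseudo-header checksum and record
	 * in csum_start/csum_offset where the full checksum still has to be
	 * completed (the usual partial-checksum setup).
	 */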
668 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 	skb->csum_start = skb_transport_header(skb) - skb->head;
670 	skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672 
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 	const struct inet_sock *inet = inet_sk(sk);
677 
678 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_IPV6_MOD(tcp_v4_send_check);
681 
682 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683 
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 				 const struct tcp_ao_hdr *aoh,
686 				 struct ip_reply_arg *arg, struct tcphdr *reply,
687 				 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 	int sdif = tcp_v4_sdif(skb);
691 	int dif = inet_iif(skb);
692 	int l3index = sdif ? dif : 0;
693 	bool allocated_traffic_key;
694 	struct tcp_ao_key *key;
695 	char *traffic_key;
696 	bool drop = true;
697 	u32 ao_sne = 0;
698 	u8 keyid;
699 
700 	rcu_read_lock();
701 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 				 &key, &traffic_key, &allocated_traffic_key,
703 				 &keyid, &ao_sne))
704 		goto out;
705 
706 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 				 (aoh->rnext_keyid << 8) | keyid);
708 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 	reply->doff = arg->iov[0].iov_len / 4;
710 
711 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 			    key, traffic_key,
713 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 			    reply, ao_sne))
716 		goto out;
717 	drop = false;
718 out:
719 	rcu_read_unlock();
720 	if (allocated_traffic_key)
721 		kfree(traffic_key);
722 	return drop;
723 #else
724 	return true;
725 #endif
726 }
727 
728 /*
729  *	This routine will send an RST to the other tcp.
730  *
731  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
732  *		      for the reset?
733  *	Answer: if a packet caused a RST, it is not for a socket
734  *		existing in our system; if it is matched to a socket,
735  *		it is just a duplicate segment or a bug in the other side's TCP.
736  *		So we build the reply based only on the parameters
737  *		that arrived with the segment.
738  *	Exception: precedence violation. We do not implement it in any case.
739  */
740 
741 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
742 			      enum sk_rst_reason reason)
743 {
744 	const struct tcphdr *th = tcp_hdr(skb);
745 	struct {
746 		struct tcphdr th;
747 		__be32 opt[REPLY_OPTIONS_LEN];
748 	} rep;
749 	const __u8 *md5_hash_location = NULL;
750 	const struct tcp_ao_hdr *aoh;
751 	struct ip_reply_arg arg;
752 #ifdef CONFIG_TCP_MD5SIG
753 	struct tcp_md5sig_key *key = NULL;
754 	unsigned char newhash[16];
755 	struct sock *sk1 = NULL;
756 	int genhash;
757 #endif
758 	u64 transmit_time = 0;
759 	struct sock *ctl_sk;
760 	struct net *net;
761 	u32 txhash = 0;
762 
763 	/* Never send a reset in response to a reset. */
764 	if (th->rst)
765 		return;
766 
767 	/* If sk not NULL, it means we did a successful lookup and incoming
768 	 * route had to be correct. prequeue might have dropped our dst.
769 	 */
770 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
771 		return;
772 
773 	/* Swap the send and the receive. */
774 	memset(&rep, 0, sizeof(rep));
775 	rep.th.dest   = th->source;
776 	rep.th.source = th->dest;
777 	rep.th.doff   = sizeof(struct tcphdr) / 4;
778 	rep.th.rst    = 1;
779 
780 	if (th->ack) {
781 		rep.th.seq = th->ack_seq;
782 	} else {
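		/* Per RFC 793, a RST replying to a segment without an ACK
		 * must itself acknowledge everything in the offending
		 * segment (SYN and FIN each occupy one sequence number).
		 */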
783 		rep.th.ack = 1;
784 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
785 				       skb->len - (th->doff << 2));
786 	}
787 
788 	memset(&arg, 0, sizeof(arg));
789 	arg.iov[0].iov_base = (unsigned char *)&rep;
790 	arg.iov[0].iov_len  = sizeof(rep.th);
791 
792 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
793 
794 	/* Invalid TCP option size or twice included auth */
795 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
796 		return;
797 
798 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
799 		return;
800 
801 #ifdef CONFIG_TCP_MD5SIG
802 	rcu_read_lock();
803 	if (sk && sk_fullsock(sk)) {
804 		const union tcp_md5_addr *addr;
805 		int l3index;
806 
807 		/* sdif set, means packet ingressed via a device
808 		 * in an L3 domain and inet_iif is set to it.
809 		 */
810 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
811 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
812 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
813 	} else if (md5_hash_location) {
814 		const union tcp_md5_addr *addr;
815 		int sdif = tcp_v4_sdif(skb);
816 		int dif = inet_iif(skb);
817 		int l3index;
818 
819 		/*
820 		 * active side is lost. Try to find the listening socket through
821 		 * the source port, and then find the MD5 key through the listening
822 		 * socket. We do not lose security here:
823 		 * the incoming packet is checked against the MD5 hash of the key we find,
824 		 * and no RST is generated if the MD5 hash doesn't match.
825 		 */
826 		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
827 					     NULL, 0, ip_hdr(skb)->saddr,
828 					     th->source, ip_hdr(skb)->daddr,
829 					     ntohs(th->source), dif, sdif);
830 		/* don't send a RST if we can't find a key */
831 		if (!sk1)
832 			goto out;
833 
834 		/* sdif set, means packet ingressed via a device
835 		 * in an L3 domain and dif is set to it.
836 		 */
837 		l3index = sdif ? dif : 0;
838 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
839 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
840 		if (!key)
841 			goto out;
842 
843 
844 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
845 		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
846 			goto out;
847 
848 	}
849 
850 	if (key) {
851 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
852 				   (TCPOPT_NOP << 16) |
853 				   (TCPOPT_MD5SIG << 8) |
854 				   TCPOLEN_MD5SIG);
855 		/* Update length and the length the header thinks exists */
856 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 		rep.th.doff = arg.iov[0].iov_len / 4;
858 
859 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
860 				     key, ip_hdr(skb)->saddr,
861 				     ip_hdr(skb)->daddr, &rep.th);
862 	}
863 #endif
864 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
865 	if (rep.opt[0] == 0) {
866 		__be32 mrst = mptcp_reset_option(skb);
867 
868 		if (mrst) {
869 			rep.opt[0] = mrst;
870 			arg.iov[0].iov_len += sizeof(mrst);
871 			rep.th.doff = arg.iov[0].iov_len / 4;
872 		}
873 	}
874 
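	/* Precompute the pseudo-header checksum; csumoffset (in 16-bit
	 * words) tells ip_send_unicast_reply() where to store the final
	 * TCP checksum in the reply.
	 */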
875 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 				      ip_hdr(skb)->saddr, /* XXX */
877 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
880 
881 	/* When the socket is gone, all binding information is lost.
882 	 * Routing might fail in this case. No choice here: if we choose to force
883 	 * the input interface, we will misroute in case of an asymmetric route.
884 	 */
885 	if (sk)
886 		arg.bound_dev_if = sk->sk_bound_dev_if;
887 
888 	trace_tcp_send_reset(sk, skb, reason);
889 
890 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
891 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
892 
893 	/* ECN bits of TW reset are cleared */
894 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
895 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
896 	local_bh_disable();
897 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
898 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
899 
900 	sock_net_set(ctl_sk, net);
901 	if (sk) {
902 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
903 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
904 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
905 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
906 		transmit_time = tcp_transmit_time(sk);
907 		xfrm_sk_clone_policy(ctl_sk, sk);
908 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
909 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
910 	} else {
911 		ctl_sk->sk_mark = 0;
912 		ctl_sk->sk_priority = 0;
913 	}
914 	ip_send_unicast_reply(ctl_sk, sk,
915 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 			      &arg, arg.iov[0].iov_len,
918 			      transmit_time, txhash);
919 
920 	xfrm_sk_free_policy(ctl_sk);
921 	sock_net_set(ctl_sk, &init_net);
922 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
923 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
924 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
925 	local_bh_enable();
926 
927 #ifdef CONFIG_TCP_MD5SIG
928 out:
929 	rcu_read_unlock();
930 #endif
931 }
932 
933 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
934    outside socket context, is ugly, certainly. What can I do?
935  */
936 
937 static void tcp_v4_send_ack(const struct sock *sk,
938 			    struct sk_buff *skb, u32 seq, u32 ack,
939 			    u32 win, u32 tsval, u32 tsecr, int oif,
940 			    struct tcp_key *key,
941 			    int reply_flags, u8 tos, u32 txhash)
942 {
943 	const struct tcphdr *th = tcp_hdr(skb);
944 	struct {
945 		struct tcphdr th;
946 		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
947 	} rep;
948 	struct net *net = sock_net(sk);
949 	struct ip_reply_arg arg;
950 	struct sock *ctl_sk;
951 	u64 transmit_time;
952 
953 	memset(&rep.th, 0, sizeof(struct tcphdr));
954 	memset(&arg, 0, sizeof(arg));
955 
956 	arg.iov[0].iov_base = (unsigned char *)&rep;
957 	arg.iov[0].iov_len  = sizeof(rep.th);
958 	if (tsecr) {
959 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
960 				   (TCPOPT_TIMESTAMP << 8) |
961 				   TCPOLEN_TIMESTAMP);
962 		rep.opt[1] = htonl(tsval);
963 		rep.opt[2] = htonl(tsecr);
964 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
965 	}
966 
967 	/* Swap the send and the receive. */
968 	rep.th.dest    = th->source;
969 	rep.th.source  = th->dest;
970 	rep.th.doff    = arg.iov[0].iov_len / 4;
971 	rep.th.seq     = htonl(seq);
972 	rep.th.ack_seq = htonl(ack);
973 	rep.th.ack     = 1;
974 	rep.th.window  = htons(win);
975 
976 #ifdef CONFIG_TCP_MD5SIG
977 	if (tcp_key_is_md5(key)) {
978 		int offset = (tsecr) ? 3 : 0;
979 
980 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
981 					  (TCPOPT_NOP << 16) |
982 					  (TCPOPT_MD5SIG << 8) |
983 					  TCPOLEN_MD5SIG);
984 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
985 		rep.th.doff = arg.iov[0].iov_len/4;
986 
987 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
988 				    key->md5_key, ip_hdr(skb)->saddr,
989 				    ip_hdr(skb)->daddr, &rep.th);
990 	}
991 #endif
992 #ifdef CONFIG_TCP_AO
993 	if (tcp_key_is_ao(key)) {
994 		int offset = (tsecr) ? 3 : 0;
995 
996 		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
997 					  (tcp_ao_len(key->ao_key) << 16) |
998 					  (key->ao_key->sndid << 8) |
999 					  key->rcv_next);
1000 		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1001 		rep.th.doff = arg.iov[0].iov_len / 4;
1002 
1003 		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1004 				key->ao_key, key->traffic_key,
1005 				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1006 				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1007 				&rep.th, key->sne);
1008 	}
1009 #endif
1010 	arg.flags = reply_flags;
1011 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1012 				      ip_hdr(skb)->saddr, /* XXX */
1013 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
1014 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1015 	if (oif)
1016 		arg.bound_dev_if = oif;
1017 	arg.tos = tos;
1018 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1019 	local_bh_disable();
1020 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1021 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1022 	sock_net_set(ctl_sk, net);
1023 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1024 			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1025 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1026 			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1027 	transmit_time = tcp_transmit_time(sk);
1028 	ip_send_unicast_reply(ctl_sk, sk,
1029 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
1030 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1031 			      &arg, arg.iov[0].iov_len,
1032 			      transmit_time, txhash);
1033 
1034 	sock_net_set(ctl_sk, &init_net);
1035 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1036 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1037 	local_bh_enable();
1038 }
1039 
1040 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1041 				enum tcp_tw_status tw_status)
1042 {
1043 	struct inet_timewait_sock *tw = inet_twsk(sk);
1044 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1045 	struct tcp_key key = {};
1046 	u8 tos = tw->tw_tos;
1047 
1048 	/* Clear only the ECN bits of TW ACKs for out-of-window data or paws_reject,
1049 	 * while not clearing the ECN bits of other TW ACKs, to avoid those ACKs
1050 	 * being placed in a different service queue (Classic rather than L4S).
1051 	 */
1052 	if (tw_status == TCP_TW_ACK_OOW)
1053 		tos &= ~INET_ECN_MASK;
1054 
1055 #ifdef CONFIG_TCP_AO
1056 	struct tcp_ao_info *ao_info;
1057 
1058 	if (static_branch_unlikely(&tcp_ao_needed.key)) {
1059 		/* FIXME: the segment to-be-acked is not verified yet */
1060 		ao_info = rcu_dereference(tcptw->ao_info);
1061 		if (ao_info) {
1062 			const struct tcp_ao_hdr *aoh;
1063 
1064 			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1065 				inet_twsk_put(tw);
1066 				return;
1067 			}
1068 
1069 			if (aoh)
1070 				key.ao_key = tcp_ao_established_key(sk, ao_info,
1071 								    aoh->rnext_keyid, -1);
1072 		}
1073 	}
1074 	if (key.ao_key) {
1075 		struct tcp_ao_key *rnext_key;
1076 
1077 		key.traffic_key = snd_other_key(key.ao_key);
1078 		key.sne = READ_ONCE(ao_info->snd_sne);
1079 		rnext_key = READ_ONCE(ao_info->rnext_key);
1080 		key.rcv_next = rnext_key->rcvid;
1081 		key.type = TCP_KEY_AO;
1082 #else
1083 	if (0) {
1084 #endif
1085 	} else if (static_branch_tcp_md5()) {
1086 		key.md5_key = tcp_twsk_md5_key(tcptw);
1087 		if (key.md5_key)
1088 			key.type = TCP_KEY_MD5;
1089 	}
1090 
1091 	tcp_v4_send_ack(sk, skb,
1092 			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1093 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1094 			tcp_tw_tsval(tcptw),
1095 			READ_ONCE(tcptw->tw_ts_recent),
1096 			tw->tw_bound_dev_if, &key,
1097 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1098 			tos,
1099 			tw->tw_txhash);
1100 
1101 	inet_twsk_put(tw);
1102 }
1103 
1104 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1105 				  struct request_sock *req)
1106 {
1107 	struct tcp_key key = {};
1108 
1109 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1110 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1111 	 */
1112 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1113 					     tcp_sk(sk)->snd_nxt;
1114 
1115 #ifdef CONFIG_TCP_AO
1116 	if (static_branch_unlikely(&tcp_ao_needed.key) &&
1117 	    tcp_rsk_used_ao(req)) {
1118 		const union tcp_md5_addr *addr;
1119 		const struct tcp_ao_hdr *aoh;
1120 		int l3index;
1121 
1122 		/* Invalid TCP option size or twice included auth */
1123 		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1124 			return;
1125 		if (!aoh)
1126 			return;
1127 
1128 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1129 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1130 		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1131 					      aoh->rnext_keyid, -1);
1132 		if (unlikely(!key.ao_key)) {
1133 			/* Send ACK with any matching MKT for the peer */
1134 			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1135 			/* Matching key disappeared (user removed the key?)
1136 			 * let the handshake time out.
1137 			 */
1138 			if (!key.ao_key) {
1139 				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1140 						     addr,
1141 						     ntohs(tcp_hdr(skb)->source),
1142 						     &ip_hdr(skb)->daddr,
1143 						     ntohs(tcp_hdr(skb)->dest));
1144 				return;
1145 			}
1146 		}
1147 		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1148 		if (!key.traffic_key)
1149 			return;
1150 
1151 		key.type = TCP_KEY_AO;
1152 		key.rcv_next = aoh->keyid;
1153 		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1154 #else
1155 	if (0) {
1156 #endif
1157 	} else if (static_branch_tcp_md5()) {
1158 		const union tcp_md5_addr *addr;
1159 		int l3index;
1160 
1161 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1162 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1163 		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1164 		if (key.md5_key)
1165 			key.type = TCP_KEY_MD5;
1166 	}
1167 
1168 	/* ECN bits are cleared, as for TW ACKs of out-of-window data or paws_reject */
1169 	tcp_v4_send_ack(sk, skb, seq,
1170 			tcp_rsk(req)->rcv_nxt,
1171 			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1172 			tcp_rsk_tsval(tcp_rsk(req)),
1173 			req->ts_recent,
1174 			0, &key,
1175 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1176 			ip_hdr(skb)->tos & ~INET_ECN_MASK,
1177 			READ_ONCE(tcp_rsk(req)->txhash));
1178 	if (tcp_key_is_ao(&key))
1179 		kfree(key.traffic_key);
1180 }
1181 
1182 /*
1183  *	Send a SYN-ACK after having received a SYN.
1184  *	This still operates on a request_sock only, not on a big
1185  *	socket.
1186  */
1187 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1188 			      struct flowi *fl,
1189 			      struct request_sock *req,
1190 			      struct tcp_fastopen_cookie *foc,
1191 			      enum tcp_synack_type synack_type,
1192 			      struct sk_buff *syn_skb)
1193 {
1194 	const struct inet_request_sock *ireq = inet_rsk(req);
1195 	struct flowi4 fl4;
1196 	int err = -1;
1197 	struct sk_buff *skb;
1198 	u8 tos;
1199 
1200 	/* First, grab a route. */
1201 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1202 		return -1;
1203 
1204 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1205 
1206 	if (skb) {
1207 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1208 
1209 		tos = READ_ONCE(inet_sk(sk)->tos);
1210 
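		/* If tcp_reflect_tos is enabled, reflect the DSCP of the
		 * incoming SYN while keeping the listener's own ECN bits.
		 */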
1211 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1212 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1213 			      (tos & INET_ECN_MASK);
1214 
1215 		if (!INET_ECN_is_capable(tos) &&
1216 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1217 			tos |= INET_ECN_ECT_0;
1218 
1219 		rcu_read_lock();
1220 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1221 					    ireq->ir_rmt_addr,
1222 					    rcu_dereference(ireq->ireq_opt),
1223 					    tos);
1224 		rcu_read_unlock();
1225 		err = net_xmit_eval(err);
1226 	}
1227 
1228 	return err;
1229 }
1230 
1231 /*
1232  *	IPv4 request_sock destructor.
1233  */
1234 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1235 {
1236 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1237 }
1238 
1239 #ifdef CONFIG_TCP_MD5SIG
1240 /*
1241  * RFC2385 MD5 checksumming requires a mapping of
1242  * IP address->MD5 Key.
1243  * We need to maintain these in the sk structure.
1244  */
1245 
1246 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1247 EXPORT_IPV6_MOD(tcp_md5_needed);
1248 
1249 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1250 {
1251 	if (!old)
1252 		return true;
1253 
1254 	/* l3index always overrides non-l3index */
1255 	if (old->l3index && new->l3index == 0)
1256 		return false;
1257 	if (old->l3index == 0 && new->l3index)
1258 		return true;
1259 
1260 	return old->prefixlen < new->prefixlen;
1261 }
1262 
1263 /* Find the Key structure for an address.  */
1264 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1265 					   const union tcp_md5_addr *addr,
1266 					   int family, bool any_l3index)
1267 {
1268 	const struct tcp_sock *tp = tcp_sk(sk);
1269 	struct tcp_md5sig_key *key;
1270 	const struct tcp_md5sig_info *md5sig;
1271 	__be32 mask;
1272 	struct tcp_md5sig_key *best_match = NULL;
1273 	bool match;
1274 
1275 	/* caller either holds rcu_read_lock() or socket lock */
1276 	md5sig = rcu_dereference_check(tp->md5sig_info,
1277 				       lockdep_sock_is_held(sk));
1278 	if (!md5sig)
1279 		return NULL;
1280 
1281 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1282 				 lockdep_sock_is_held(sk)) {
1283 		if (key->family != family)
1284 			continue;
1285 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1286 		    key->l3index != l3index)
1287 			continue;
1288 		if (family == AF_INET) {
1289 			mask = inet_make_mask(key->prefixlen);
1290 			match = (key->addr.a4.s_addr & mask) ==
1291 				(addr->a4.s_addr & mask);
1292 #if IS_ENABLED(CONFIG_IPV6)
1293 		} else if (family == AF_INET6) {
1294 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1295 						  key->prefixlen);
1296 #endif
1297 		} else {
1298 			match = false;
1299 		}
1300 
1301 		if (match && better_md5_match(best_match, key))
1302 			best_match = key;
1303 	}
1304 	return best_match;
1305 }
1306 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1307 
1308 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1309 						      const union tcp_md5_addr *addr,
1310 						      int family, u8 prefixlen,
1311 						      int l3index, u8 flags)
1312 {
1313 	const struct tcp_sock *tp = tcp_sk(sk);
1314 	struct tcp_md5sig_key *key;
1315 	unsigned int size = sizeof(struct in_addr);
1316 	const struct tcp_md5sig_info *md5sig;
1317 
1318 	/* caller either holds rcu_read_lock() or socket lock */
1319 	md5sig = rcu_dereference_check(tp->md5sig_info,
1320 				       lockdep_sock_is_held(sk));
1321 	if (!md5sig)
1322 		return NULL;
1323 #if IS_ENABLED(CONFIG_IPV6)
1324 	if (family == AF_INET6)
1325 		size = sizeof(struct in6_addr);
1326 #endif
1327 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1328 				 lockdep_sock_is_held(sk)) {
1329 		if (key->family != family)
1330 			continue;
1331 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1332 			continue;
1333 		if (key->l3index != l3index)
1334 			continue;
1335 		if (!memcmp(&key->addr, addr, size) &&
1336 		    key->prefixlen == prefixlen)
1337 			return key;
1338 	}
1339 	return NULL;
1340 }
1341 
1342 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1343 					 const struct sock *addr_sk)
1344 {
1345 	const union tcp_md5_addr *addr;
1346 	int l3index;
1347 
1348 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1349 						 addr_sk->sk_bound_dev_if);
1350 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1351 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1352 }
1353 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1354 
1355 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1356 {
1357 	struct tcp_sock *tp = tcp_sk(sk);
1358 	struct tcp_md5sig_info *md5sig;
1359 
1360 	md5sig = kmalloc(sizeof(*md5sig), gfp);
1361 	if (!md5sig)
1362 		return -ENOMEM;
1363 
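	/* The MD5 signature has to be computed for every segment that is
	 * sent, so segmentation offload is disabled once signing is enabled
	 * on this socket.
	 */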
1364 	sk_gso_disable(sk);
1365 	INIT_HLIST_HEAD(&md5sig->head);
1366 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1367 	return 0;
1368 }
1369 
1370 /* This can be called on a newly created socket, from other files */
1371 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1372 			    int family, u8 prefixlen, int l3index, u8 flags,
1373 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1374 {
1375 	/* Add Key to the list */
1376 	struct tcp_md5sig_key *key;
1377 	struct tcp_sock *tp = tcp_sk(sk);
1378 	struct tcp_md5sig_info *md5sig;
1379 
1380 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1381 	if (key) {
1382 		/* Pre-existing entry - just update that one.
1383 		 * Note that the key might be used concurrently.
1384 		 * data_race() is telling KCSAN that we do not care about
1385 		 * key mismatches, since changing MD5 key on live flows
1386 		 * can lead to packet drops.
1387 		 */
1388 		data_race(memcpy(key->key, newkey, newkeylen));
1389 
1390 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1391 		 * Also note that a reader could catch new key->keylen value
1392 		 * but old key->key[], this is the reason we use __GFP_ZERO
1393 		 * at sock_kmalloc() time below these lines.
1394 		 */
1395 		WRITE_ONCE(key->keylen, newkeylen);
1396 
1397 		return 0;
1398 	}
1399 
1400 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1401 					   lockdep_sock_is_held(sk));
1402 
1403 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1404 	if (!key)
1405 		return -ENOMEM;
1406 
1407 	memcpy(key->key, newkey, newkeylen);
1408 	key->keylen = newkeylen;
1409 	key->family = family;
1410 	key->prefixlen = prefixlen;
1411 	key->l3index = l3index;
1412 	key->flags = flags;
1413 	memcpy(&key->addr, addr,
1414 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1415 								 sizeof(struct in_addr));
1416 	hlist_add_head_rcu(&key->node, &md5sig->head);
1417 	return 0;
1418 }
1419 
1420 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1421 		   int family, u8 prefixlen, int l3index, u8 flags,
1422 		   const u8 *newkey, u8 newkeylen)
1423 {
1424 	struct tcp_sock *tp = tcp_sk(sk);
1425 
1426 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1427 		if (tcp_md5_alloc_sigpool())
1428 			return -ENOMEM;
1429 
1430 		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1431 			tcp_md5_release_sigpool();
1432 			return -ENOMEM;
1433 		}
1434 
1435 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1436 			struct tcp_md5sig_info *md5sig;
1437 
1438 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1439 			rcu_assign_pointer(tp->md5sig_info, NULL);
1440 			kfree_rcu(md5sig, rcu);
1441 			tcp_md5_release_sigpool();
1442 			return -EUSERS;
1443 		}
1444 	}
1445 
1446 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1447 				newkey, newkeylen, GFP_KERNEL);
1448 }
1449 EXPORT_IPV6_MOD(tcp_md5_do_add);
1450 
1451 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1452 		     int family, u8 prefixlen, int l3index,
1453 		     struct tcp_md5sig_key *key)
1454 {
1455 	struct tcp_sock *tp = tcp_sk(sk);
1456 
1457 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1458 		tcp_md5_add_sigpool();
1459 
1460 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1461 			tcp_md5_release_sigpool();
1462 			return -ENOMEM;
1463 		}
1464 
1465 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1466 			struct tcp_md5sig_info *md5sig;
1467 
1468 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1469 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1470 			rcu_assign_pointer(tp->md5sig_info, NULL);
1471 			kfree_rcu(md5sig, rcu);
1472 			tcp_md5_release_sigpool();
1473 			return -EUSERS;
1474 		}
1475 	}
1476 
1477 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1478 				key->flags, key->key, key->keylen,
1479 				sk_gfp_mask(sk, GFP_ATOMIC));
1480 }
1481 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1482 
1483 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1484 		   u8 prefixlen, int l3index, u8 flags)
1485 {
1486 	struct tcp_md5sig_key *key;
1487 
1488 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1489 	if (!key)
1490 		return -ENOENT;
1491 	hlist_del_rcu(&key->node);
1492 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1493 	kfree_rcu(key, rcu);
1494 	return 0;
1495 }
1496 EXPORT_IPV6_MOD(tcp_md5_do_del);
1497 
1498 void tcp_clear_md5_list(struct sock *sk)
1499 {
1500 	struct tcp_sock *tp = tcp_sk(sk);
1501 	struct tcp_md5sig_key *key;
1502 	struct hlist_node *n;
1503 	struct tcp_md5sig_info *md5sig;
1504 
1505 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1506 
1507 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1508 		hlist_del_rcu(&key->node);
1509 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1510 		kfree_rcu(key, rcu);
1511 	}
1512 }
1513 
1514 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1515 				 sockptr_t optval, int optlen)
1516 {
1517 	struct tcp_md5sig cmd;
1518 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1519 	const union tcp_md5_addr *addr;
1520 	u8 prefixlen = 32;
1521 	int l3index = 0;
1522 	bool l3flag;
1523 	u8 flags;
1524 
1525 	if (optlen < sizeof(cmd))
1526 		return -EINVAL;
1527 
1528 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1529 		return -EFAULT;
1530 
1531 	if (sin->sin_family != AF_INET)
1532 		return -EINVAL;
1533 
1534 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1535 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1536 
1537 	if (optname == TCP_MD5SIG_EXT &&
1538 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1539 		prefixlen = cmd.tcpm_prefixlen;
1540 		if (prefixlen > 32)
1541 			return -EINVAL;
1542 	}
1543 
1544 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1545 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1546 		struct net_device *dev;
1547 
1548 		rcu_read_lock();
1549 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1550 		if (dev && netif_is_l3_master(dev))
1551 			l3index = dev->ifindex;
1552 
1553 		rcu_read_unlock();
1554 
1555 		/* ok to reference set/not set outside of rcu;
1556 		 * right now device MUST be an L3 master
1557 		 */
1558 		if (!dev || !l3index)
1559 			return -EINVAL;
1560 	}
1561 
1562 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1563 
1564 	if (!cmd.tcpm_keylen)
1565 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1566 
1567 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1568 		return -EINVAL;
1569 
1570 	/* Don't allow keys for peers that have a matching TCP-AO key.
1571 	 * See the comment in tcp_ao_add_cmd()
1572 	 */
1573 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1574 		return -EKEYREJECTED;
1575 
1576 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1577 			      cmd.tcpm_key, cmd.tcpm_keylen);
1578 }
1579 
1580 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1581 				   __be32 daddr, __be32 saddr,
1582 				   const struct tcphdr *th, int nbytes)
1583 {
1584 	struct tcp4_pseudohdr *bp;
1585 	struct scatterlist sg;
1586 	struct tcphdr *_th;
1587 
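	/* RFC 2385 digest input: the IPv4 pseudo-header followed by the TCP
	 * header with its checksum field zeroed; the callers hash the
	 * payload (if any) and the key afterwards.
	 */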
1588 	bp = hp->scratch;
1589 	bp->saddr = saddr;
1590 	bp->daddr = daddr;
1591 	bp->pad = 0;
1592 	bp->protocol = IPPROTO_TCP;
1593 	bp->len = cpu_to_be16(nbytes);
1594 
1595 	_th = (struct tcphdr *)(bp + 1);
1596 	memcpy(_th, th, sizeof(*th));
1597 	_th->check = 0;
1598 
1599 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1600 	ahash_request_set_crypt(hp->req, &sg, NULL,
1601 				sizeof(*bp) + sizeof(*th));
1602 	return crypto_ahash_update(hp->req);
1603 }
1604 
1605 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1606 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1607 {
1608 	struct tcp_sigpool hp;
1609 
1610 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1611 		goto clear_hash_nostart;
1612 
1613 	if (crypto_ahash_init(hp.req))
1614 		goto clear_hash;
1615 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1616 		goto clear_hash;
1617 	if (tcp_md5_hash_key(&hp, key))
1618 		goto clear_hash;
1619 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1620 	if (crypto_ahash_final(hp.req))
1621 		goto clear_hash;
1622 
1623 	tcp_sigpool_end(&hp);
1624 	return 0;
1625 
1626 clear_hash:
1627 	tcp_sigpool_end(&hp);
1628 clear_hash_nostart:
1629 	memset(md5_hash, 0, 16);
1630 	return 1;
1631 }
1632 
1633 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1634 			const struct sock *sk,
1635 			const struct sk_buff *skb)
1636 {
1637 	const struct tcphdr *th = tcp_hdr(skb);
1638 	struct tcp_sigpool hp;
1639 	__be32 saddr, daddr;
1640 
1641 	if (sk) { /* valid for establish/request sockets */
1642 		saddr = sk->sk_rcv_saddr;
1643 		daddr = sk->sk_daddr;
1644 	} else {
1645 		const struct iphdr *iph = ip_hdr(skb);
1646 		saddr = iph->saddr;
1647 		daddr = iph->daddr;
1648 	}
1649 
1650 	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1651 		goto clear_hash_nostart;
1652 
1653 	if (crypto_ahash_init(hp.req))
1654 		goto clear_hash;
1655 
1656 	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1657 		goto clear_hash;
1658 	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1659 		goto clear_hash;
1660 	if (tcp_md5_hash_key(&hp, key))
1661 		goto clear_hash;
1662 	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1663 	if (crypto_ahash_final(hp.req))
1664 		goto clear_hash;
1665 
1666 	tcp_sigpool_end(&hp);
1667 	return 0;
1668 
1669 clear_hash:
1670 	tcp_sigpool_end(&hp);
1671 clear_hash_nostart:
1672 	memset(md5_hash, 0, 16);
1673 	return 1;
1674 }
1675 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1676 
1677 #endif
1678 
1679 static void tcp_v4_init_req(struct request_sock *req,
1680 			    const struct sock *sk_listener,
1681 			    struct sk_buff *skb)
1682 {
1683 	struct inet_request_sock *ireq = inet_rsk(req);
1684 	struct net *net = sock_net(sk_listener);
1685 
1686 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1687 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1688 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1689 }
1690 
1691 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1692 					  struct sk_buff *skb,
1693 					  struct flowi *fl,
1694 					  struct request_sock *req,
1695 					  u32 tw_isn)
1696 {
1697 	tcp_v4_init_req(req, sk, skb);
1698 
1699 	if (security_inet_conn_request(sk, skb, req))
1700 		return NULL;
1701 
1702 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1703 }
1704 
1705 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1706 	.family		=	PF_INET,
1707 	.obj_size	=	sizeof(struct tcp_request_sock),
1708 	.send_ack	=	tcp_v4_reqsk_send_ack,
1709 	.destructor	=	tcp_v4_reqsk_destructor,
1710 	.send_reset	=	tcp_v4_send_reset,
1711 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1712 };
1713 
1714 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1715 	.mss_clamp	=	TCP_MSS_DEFAULT,
1716 #ifdef CONFIG_TCP_MD5SIG
1717 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1718 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1719 #endif
1720 #ifdef CONFIG_TCP_AO
1721 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1722 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1723 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1724 #endif
1725 #ifdef CONFIG_SYN_COOKIES
1726 	.cookie_init_seq =	cookie_v4_init_sequence,
1727 #endif
1728 	.route_req	=	tcp_v4_route_req,
1729 	.init_seq	=	tcp_v4_init_seq,
1730 	.init_ts_off	=	tcp_v4_init_ts_off,
1731 	.send_synack	=	tcp_v4_send_synack,
1732 };
1733 
1734 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1735 {
1736 	/* Never answer SYNs sent to broadcast or multicast addresses */
1737 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1738 		goto drop;
1739 
1740 	return tcp_conn_request(&tcp_request_sock_ops,
1741 				&tcp_request_sock_ipv4_ops, sk, skb);
1742 
1743 drop:
1744 	tcp_listendrop(sk);
1745 	return 0;
1746 }
1747 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1748 
1749 
1750 /*
1751  * The three-way handshake has completed - we received a valid ACK -
1752  * now create the new socket.
1753  */
1754 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1755 				  struct request_sock *req,
1756 				  struct dst_entry *dst,
1757 				  struct request_sock *req_unhash,
1758 				  bool *own_req)
1759 {
1760 	struct inet_request_sock *ireq;
1761 	bool found_dup_sk = false;
1762 	struct inet_sock *newinet;
1763 	struct tcp_sock *newtp;
1764 	struct sock *newsk;
1765 #ifdef CONFIG_TCP_MD5SIG
1766 	const union tcp_md5_addr *addr;
1767 	struct tcp_md5sig_key *key;
1768 	int l3index;
1769 #endif
1770 	struct ip_options_rcu *inet_opt;
1771 
1772 	if (sk_acceptq_is_full(sk))
1773 		goto exit_overflow;
1774 
1775 	newsk = tcp_create_openreq_child(sk, req, skb);
1776 	if (!newsk)
1777 		goto exit_nonewsk;
1778 
1779 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1780 	inet_sk_rx_dst_set(newsk, skb);
1781 
1782 	newtp		      = tcp_sk(newsk);
1783 	newinet		      = inet_sk(newsk);
1784 	ireq		      = inet_rsk(req);
1785 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1786 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1787 	newinet->mc_index     = inet_iif(skb);
1788 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1789 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1790 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1791 	if (inet_opt)
1792 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1793 	atomic_set(&newinet->inet_id, get_random_u16());
1794 
1795 	/* Set the ToS of the new socket based upon the value of the incoming SYN.
1796 	 * ECT bits are set later in tcp_init_transfer().
1797 	 */
1798 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1799 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
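		/* Editor's illustration (not in the upstream source): INET_ECN_MASK is
		 * 0x3, so only the two ECN bits of the SYN's ToS are cleared here.
		 * For example, a SYN carrying ToS 0xb9 (DSCP 46 / EF with ECN bits 01)
		 * leaves newinet->tos == 0xb8, i.e. the DSCP value is reflected while
		 * the ECN bits are not.
		 */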
1800 
1801 	if (!dst) {
1802 		dst = inet_csk_route_child_sock(sk, newsk, req);
1803 		if (!dst)
1804 			goto put_and_exit;
1805 	} else {
1806 		/* syncookie case : see end of cookie_v4_check() */
1807 	}
1808 	sk_setup_caps(newsk, dst);
1809 
1810 	tcp_ca_openreq_child(newsk, dst);
1811 
1812 	tcp_sync_mss(newsk, dst_mtu(dst));
1813 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1814 
1815 	tcp_initialize_rcv_mss(newsk);
1816 
1817 #ifdef CONFIG_TCP_MD5SIG
1818 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1819 	/* Copy over the MD5 key from the original socket */
1820 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1821 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1822 	if (key && !tcp_rsk_used_ao(req)) {
1823 		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1824 			goto put_and_exit;
1825 		sk_gso_disable(newsk);
1826 	}
1827 #endif
1828 #ifdef CONFIG_TCP_AO
1829 	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1830 		goto put_and_exit; /* OOM, release back memory */
1831 #endif
1832 
1833 	if (__inet_inherit_port(sk, newsk) < 0)
1834 		goto put_and_exit;
1835 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1836 				       &found_dup_sk);
1837 	if (likely(*own_req)) {
1838 		tcp_move_syn(newtp, req);
1839 		ireq->ireq_opt = NULL;
1840 	} else {
1841 		newinet->inet_opt = NULL;
1842 
1843 		if (!req_unhash && found_dup_sk) {
1844 			/* This code path should be executed only in the
1845 			 * syncookie case.
1846 			 */
1847 			bh_unlock_sock(newsk);
1848 			sock_put(newsk);
1849 			newsk = NULL;
1850 		}
1851 	}
1852 	return newsk;
1853 
1854 exit_overflow:
1855 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1856 exit_nonewsk:
1857 	dst_release(dst);
1858 exit:
1859 	tcp_listendrop(sk);
1860 	return NULL;
1861 put_and_exit:
1862 	newinet->inet_opt = NULL;
1863 	inet_csk_prepare_forced_close(newsk);
1864 	tcp_done(newsk);
1865 	goto exit;
1866 }
1867 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1868 
1869 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1870 {
1871 #ifdef CONFIG_SYN_COOKIES
1872 	const struct tcphdr *th = tcp_hdr(skb);
1873 
1874 	if (!th->syn)
1875 		sk = cookie_v4_check(sk, skb);
1876 #endif
1877 	return sk;
1878 }
1879 
1880 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1881 			 struct tcphdr *th, u32 *cookie)
1882 {
1883 	u16 mss = 0;
1884 #ifdef CONFIG_SYN_COOKIES
1885 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1886 				    &tcp_request_sock_ipv4_ops, sk, th);
1887 	if (mss) {
1888 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1889 		tcp_synq_overflow(sk);
1890 	}
1891 #endif
1892 	return mss;
1893 }
1894 
1895 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1896 							   u32));
1897 /* The socket must have its spinlock held when we get
1898  * here, unless it is a TCP_LISTEN socket.
1899  *
1900  * We have a potential double-lock case here, so even when
1901  * doing backlog processing we use the BH locking scheme.
1902  * This is because we cannot sleep with the original spinlock
1903  * held.
1904  */
1905 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1906 {
1907 	enum skb_drop_reason reason;
1908 	struct sock *rsk;
1909 
1910 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1911 		struct dst_entry *dst;
1912 
1913 		dst = rcu_dereference_protected(sk->sk_rx_dst,
1914 						lockdep_sock_is_held(sk));
1915 
1916 		sock_rps_save_rxhash(sk, skb);
1917 		sk_mark_napi_id(sk, skb);
1918 		if (dst) {
1919 			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1920 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1921 					     dst, 0)) {
1922 				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1923 				dst_release(dst);
1924 			}
1925 		}
1926 		tcp_rcv_established(sk, skb);
1927 		return 0;
1928 	}
1929 
1930 	if (tcp_checksum_complete(skb))
1931 		goto csum_err;
1932 
1933 	if (sk->sk_state == TCP_LISTEN) {
1934 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1935 
1936 		if (!nsk)
1937 			return 0;
1938 		if (nsk != sk) {
1939 			reason = tcp_child_process(sk, nsk, skb);
1940 			if (reason) {
1941 				rsk = nsk;
1942 				goto reset;
1943 			}
1944 			return 0;
1945 		}
1946 	} else
1947 		sock_rps_save_rxhash(sk, skb);
1948 
1949 	reason = tcp_rcv_state_process(sk, skb);
1950 	if (reason) {
1951 		rsk = sk;
1952 		goto reset;
1953 	}
1954 	return 0;
1955 
1956 reset:
1957 	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1958 discard:
1959 	sk_skb_reason_drop(sk, skb, reason);
1960 	/* Be careful here. If this function gets more complicated and
1961 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1962 	 * might be destroyed here. This current version compiles correctly,
1963 	 * but you have been warned.
1964 	 */
1965 	return 0;
1966 
1967 csum_err:
1968 	reason = SKB_DROP_REASON_TCP_CSUM;
1969 	trace_tcp_bad_csum(skb);
1970 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1971 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1972 	goto discard;
1973 }
1974 EXPORT_SYMBOL(tcp_v4_do_rcv);
1975 
1976 int tcp_v4_early_demux(struct sk_buff *skb)
1977 {
1978 	struct net *net = dev_net_rcu(skb->dev);
1979 	const struct iphdr *iph;
1980 	const struct tcphdr *th;
1981 	struct sock *sk;
1982 
1983 	if (skb->pkt_type != PACKET_HOST)
1984 		return 0;
1985 
1986 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1987 		return 0;
1988 
1989 	iph = ip_hdr(skb);
1990 	th = tcp_hdr(skb);
1991 
1992 	if (th->doff < sizeof(struct tcphdr) / 4)
1993 		return 0;
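	/* Editor's note, added for illustration: sizeof(struct tcphdr) / 4 == 5,
	 * so the check above rejects any segment whose data offset advertises
	 * less than the 20-byte minimum TCP header.
	 */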
1994 
1995 	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1996 				       iph->saddr, th->source,
1997 				       iph->daddr, ntohs(th->dest),
1998 				       skb->skb_iif, inet_sdif(skb));
1999 	if (sk) {
2000 		skb->sk = sk;
2001 		skb->destructor = sock_edemux;
2002 		if (sk_fullsock(sk)) {
2003 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2004 
2005 			if (dst)
2006 				dst = dst_check(dst, 0);
2007 			if (dst &&
2008 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
2009 				skb_dst_set_noref(skb, dst);
2010 		}
2011 	}
2012 	return 0;
2013 }
2014 
2015 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2016 		     enum skb_drop_reason *reason)
2017 {
2018 	u32 tail_gso_size, tail_gso_segs;
2019 	struct skb_shared_info *shinfo;
2020 	const struct tcphdr *th;
2021 	struct tcphdr *thtail;
2022 	struct sk_buff *tail;
2023 	unsigned int hdrlen;
2024 	bool fragstolen;
2025 	u32 gso_segs;
2026 	u32 gso_size;
2027 	u64 limit;
2028 	int delta;
2029 	int err;
2030 
2031 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2032 	 * we can fix skb->truesize to its real value to avoid future drops.
2033 	 * This is valid because skb is not yet charged to the socket.
2034 	 * It has been noticed that pure SACK packets were sometimes dropped
2035 	 * (if cooked by drivers without the copybreak feature).
2036 	 */
2037 	skb_condense(skb);
2038 
2039 	tcp_cleanup_skb(skb);
2040 
2041 	if (unlikely(tcp_checksum_complete(skb))) {
2042 		bh_unlock_sock(sk);
2043 		trace_tcp_bad_csum(skb);
2044 		*reason = SKB_DROP_REASON_TCP_CSUM;
2045 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2046 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2047 		return true;
2048 	}
2049 
2050 	/* Attempt coalescing to the last skb in the backlog, even if we are
2051 	 * above the limits.
2052 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2053 	 */
2054 	th = (const struct tcphdr *)skb->data;
2055 	hdrlen = th->doff * 4;
2056 
2057 	tail = sk->sk_backlog.tail;
2058 	if (!tail)
2059 		goto no_coalesce;
2060 	thtail = (struct tcphdr *)tail->data;
2061 
2062 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2063 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2064 	    ((TCP_SKB_CB(tail)->tcp_flags |
2065 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2066 	    !((TCP_SKB_CB(tail)->tcp_flags &
2067 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2068 	    ((TCP_SKB_CB(tail)->tcp_flags ^
2069 	      TCP_SKB_CB(skb)->tcp_flags) &
2070 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2071 	    !tcp_skb_can_collapse_rx(tail, skb) ||
2072 	    thtail->doff != th->doff ||
2073 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2074 		goto no_coalesce;
2075 
2076 	__skb_pull(skb, hdrlen);
2077 
2078 	shinfo = skb_shinfo(skb);
2079 	gso_size = shinfo->gso_size ?: skb->len;
2080 	gso_segs = shinfo->gso_segs ?: 1;
2081 
2082 	shinfo = skb_shinfo(tail);
2083 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2084 	tail_gso_segs = shinfo->gso_segs ?: 1;
2085 
2086 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2087 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2088 
2089 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2090 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2091 			thtail->window = th->window;
2092 		}
2093 
2094 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2095 		 * thtail->fin, so that the fast path in tcp_rcv_established()
2096 		 * is not entered if we append a packet with a FIN.
2097 		 * SYN, RST, URG are not present.
2098 		 * ACK is set on both packets.
2099 		 * PSH : we do not really care in TCP stack,
2100 		 *       at least for 'GRO' packets.
2101 		 */
2102 		thtail->fin |= th->fin;
2103 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2104 
2105 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
2106 			TCP_SKB_CB(tail)->has_rxtstamp = true;
2107 			tail->tstamp = skb->tstamp;
2108 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2109 		}
2110 
2111 		/* Not as strict as GRO. We only need to carry the max mss value */
2112 		shinfo->gso_size = max(gso_size, tail_gso_size);
2113 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
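		/* Editor's illustration (not upstream code): if the backlog tail was
		 * a GRO packet with gso_size 1448 / gso_segs 3 and the new skb
		 * carries gso_size 1200 / gso_segs 2, the coalesced skb ends up with
		 * gso_size 1448 and gso_segs 5 (capped at 0xFFFF).
		 */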
2114 
2115 		sk->sk_backlog.len += delta;
2116 		__NET_INC_STATS(sock_net(sk),
2117 				LINUX_MIB_TCPBACKLOGCOALESCE);
2118 		kfree_skb_partial(skb, fragstolen);
2119 		return false;
2120 	}
2121 	__skb_push(skb, hdrlen);
2122 
2123 no_coalesce:
2124 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2125 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2126 	 * sk_rcvbuf in normal conditions.
2127 	 */
2128 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2129 
2130 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2131 
2132 	/* Only the socket owner can try to collapse/prune rx queues
2133 	 * to reduce memory overhead, so add a little headroom here.
2134 	 * Only a few socket backlogs are likely to be non-empty concurrently.
2135 	 */
2136 	limit += 64 * 1024;
2137 
2138 	limit = min_t(u64, limit, UINT_MAX);
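	/* Worked example (editor's addition, assuming sk_rcvbuf = 512 KB and
	 * sk_sndbuf = 256 KB): limit = 2 * 512 KB + 256 KB / 2 + 64 KB = 1216 KB,
	 * far below the UINT_MAX clamp applied above.
	 */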
2139 
2140 	err = sk_add_backlog(sk, skb, limit);
2141 	if (unlikely(err)) {
2142 		bh_unlock_sock(sk);
2143 		if (err == -ENOMEM) {
2144 			*reason = SKB_DROP_REASON_PFMEMALLOC;
2145 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2146 		} else {
2147 			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2148 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2149 		}
2150 		return true;
2151 	}
2152 	return false;
2153 }
2154 EXPORT_IPV6_MOD(tcp_add_backlog);
2155 
2156 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2157 {
2158 	struct tcphdr *th = (struct tcphdr *)skb->data;
2159 
2160 	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
2161 }
2162 EXPORT_IPV6_MOD(tcp_filter);
2163 
2164 static void tcp_v4_restore_cb(struct sk_buff *skb)
2165 {
2166 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2167 		sizeof(struct inet_skb_parm));
2168 }
2169 
2170 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2171 			   const struct tcphdr *th)
2172 {
2173 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2174 	 * barrier() makes sure the compiler won't play fool^Waliasing games.
2175 	 */
2176 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2177 		sizeof(struct inet_skb_parm));
2178 	barrier();
2179 
2180 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2181 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2182 				    skb->len - th->doff * 4);
2183 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2184 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2185 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2186 	TCP_SKB_CB(skb)->sacked	 = 0;
2187 	TCP_SKB_CB(skb)->has_rxtstamp =
2188 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2189 }
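/* Editor's illustration of the end_seq computation above (not upstream code):
 * for a pure data segment with seq 1000, a 32-byte header (doff == 8) and
 * 500 bytes of payload (skb->len == 532), end_seq is 1000 + 0 + 0 + 532 - 32
 * = 1500; a SYN or FIN flag would each consume one more sequence number.
 */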
2190 
2191 /*
2192  *	From tcp_input.c
2193  */
2194 
2195 int tcp_v4_rcv(struct sk_buff *skb)
2196 {
2197 	struct net *net = dev_net_rcu(skb->dev);
2198 	enum skb_drop_reason drop_reason;
2199 	enum tcp_tw_status tw_status;
2200 	int sdif = inet_sdif(skb);
2201 	int dif = inet_iif(skb);
2202 	const struct iphdr *iph;
2203 	const struct tcphdr *th;
2204 	struct sock *sk = NULL;
2205 	bool refcounted;
2206 	int ret;
2207 	u32 isn;
2208 
2209 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2210 	if (skb->pkt_type != PACKET_HOST)
2211 		goto discard_it;
2212 
2213 	/* Count it even if it's bad */
2214 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2215 
2216 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2217 		goto discard_it;
2218 
2219 	th = (const struct tcphdr *)skb->data;
2220 
2221 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2222 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2223 		goto bad_packet;
2224 	}
2225 	if (!pskb_may_pull(skb, th->doff * 4))
2226 		goto discard_it;
2227 
2228 	/* An explanation is required here, I think.
2229 	 * Packet length and doff are validated by header prediction,
2230 	 * provided the case of th->doff == 0 is eliminated.
2231 	 * So, we defer the checks. */
2232 
2233 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2234 		goto csum_error;
2235 
2236 	th = (const struct tcphdr *)skb->data;
2237 	iph = ip_hdr(skb);
2238 lookup:
2239 	sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2240 			       skb, __tcp_hdrlen(th), th->source,
2241 			       th->dest, sdif, &refcounted);
2242 	if (!sk)
2243 		goto no_tcp_socket;
2244 
2245 	if (sk->sk_state == TCP_TIME_WAIT)
2246 		goto do_time_wait;
2247 
2248 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2249 		struct request_sock *req = inet_reqsk(sk);
2250 		bool req_stolen = false;
2251 		struct sock *nsk;
2252 
2253 		sk = req->rsk_listener;
2254 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2255 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2256 		else
2257 			drop_reason = tcp_inbound_hash(sk, req, skb,
2258 						       &iph->saddr, &iph->daddr,
2259 						       AF_INET, dif, sdif);
2260 		if (unlikely(drop_reason)) {
2261 			sk_drops_add(sk, skb);
2262 			reqsk_put(req);
2263 			goto discard_it;
2264 		}
2265 		if (tcp_checksum_complete(skb)) {
2266 			reqsk_put(req);
2267 			goto csum_error;
2268 		}
2269 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2270 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2271 			if (!nsk) {
2272 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2273 				goto lookup;
2274 			}
2275 			sk = nsk;
2276 			/* reuseport_migrate_sock() has already held one sk_refcnt
2277 			 * before returning.
2278 			 */
2279 		} else {
2280 			/* We own a reference on the listener, increase it again
2281 			 * as we might lose it too soon.
2282 			 */
2283 			sock_hold(sk);
2284 		}
2285 		refcounted = true;
2286 		nsk = NULL;
2287 		if (!tcp_filter(sk, skb, &drop_reason)) {
2288 			th = (const struct tcphdr *)skb->data;
2289 			iph = ip_hdr(skb);
2290 			tcp_v4_fill_cb(skb, iph, th);
2291 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2292 					    &drop_reason);
2293 		}
2294 		if (!nsk) {
2295 			reqsk_put(req);
2296 			if (req_stolen) {
2297 				/* Another cpu got exclusive access to req
2298 				 * and created a full-blown socket.
2299 				 * Try to feed this packet to this socket
2300 				 * instead of discarding it.
2301 				 */
2302 				tcp_v4_restore_cb(skb);
2303 				sock_put(sk);
2304 				goto lookup;
2305 			}
2306 			goto discard_and_relse;
2307 		}
2308 		nf_reset_ct(skb);
2309 		if (nsk == sk) {
2310 			reqsk_put(req);
2311 			tcp_v4_restore_cb(skb);
2312 		} else {
2313 			drop_reason = tcp_child_process(sk, nsk, skb);
2314 			if (drop_reason) {
2315 				enum sk_rst_reason rst_reason;
2316 
2317 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2318 				tcp_v4_send_reset(nsk, skb, rst_reason);
2319 				goto discard_and_relse;
2320 			}
2321 			sock_put(sk);
2322 			return 0;
2323 		}
2324 	}
2325 
2326 process:
2327 	if (static_branch_unlikely(&ip4_min_ttl)) {
2328 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2329 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2330 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2331 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2332 			goto discard_and_relse;
2333 		}
2334 	}
2335 
2336 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2337 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2338 		goto discard_and_relse;
2339 	}
2340 
2341 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2342 				       AF_INET, dif, sdif);
2343 	if (drop_reason)
2344 		goto discard_and_relse;
2345 
2346 	nf_reset_ct(skb);
2347 
2348 	if (tcp_filter(sk, skb, &drop_reason))
2349 		goto discard_and_relse;
2350 
2351 	th = (const struct tcphdr *)skb->data;
2352 	iph = ip_hdr(skb);
2353 	tcp_v4_fill_cb(skb, iph, th);
2354 
2355 	skb->dev = NULL;
2356 
2357 	if (sk->sk_state == TCP_LISTEN) {
2358 		ret = tcp_v4_do_rcv(sk, skb);
2359 		goto put_and_return;
2360 	}
2361 
2362 	sk_incoming_cpu_update(sk);
2363 
2364 	bh_lock_sock_nested(sk);
2365 	tcp_segs_in(tcp_sk(sk), skb);
2366 	ret = 0;
2367 	if (!sock_owned_by_user(sk)) {
2368 		ret = tcp_v4_do_rcv(sk, skb);
2369 	} else {
2370 		if (tcp_add_backlog(sk, skb, &drop_reason))
2371 			goto discard_and_relse;
2372 	}
2373 	bh_unlock_sock(sk);
2374 
2375 put_and_return:
2376 	if (refcounted)
2377 		sock_put(sk);
2378 
2379 	return ret;
2380 
2381 no_tcp_socket:
2382 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2383 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2384 		goto discard_it;
2385 
2386 	tcp_v4_fill_cb(skb, iph, th);
2387 
2388 	if (tcp_checksum_complete(skb)) {
2389 csum_error:
2390 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2391 		trace_tcp_bad_csum(skb);
2392 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2393 bad_packet:
2394 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2395 	} else {
2396 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2397 	}
2398 
2399 discard_it:
2400 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2401 	/* Discard frame. */
2402 	sk_skb_reason_drop(sk, skb, drop_reason);
2403 	return 0;
2404 
2405 discard_and_relse:
2406 	sk_drops_add(sk, skb);
2407 	if (refcounted)
2408 		sock_put(sk);
2409 	goto discard_it;
2410 
2411 do_time_wait:
2412 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2413 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2414 		inet_twsk_put(inet_twsk(sk));
2415 		goto discard_it;
2416 	}
2417 
2418 	tcp_v4_fill_cb(skb, iph, th);
2419 
2420 	if (tcp_checksum_complete(skb)) {
2421 		inet_twsk_put(inet_twsk(sk));
2422 		goto csum_error;
2423 	}
2424 
2425 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2426 					       &drop_reason);
2427 	switch (tw_status) {
2428 	case TCP_TW_SYN: {
2429 		struct sock *sk2 = inet_lookup_listener(net,
2430 							net->ipv4.tcp_death_row.hashinfo,
2431 							skb, __tcp_hdrlen(th),
2432 							iph->saddr, th->source,
2433 							iph->daddr, th->dest,
2434 							inet_iif(skb),
2435 							sdif);
2436 		if (sk2) {
2437 			inet_twsk_deschedule_put(inet_twsk(sk));
2438 			sk = sk2;
2439 			tcp_v4_restore_cb(skb);
2440 			refcounted = false;
2441 			__this_cpu_write(tcp_tw_isn, isn);
2442 			goto process;
2443 		}
2444 	}
2445 		/* to ACK */
2446 		fallthrough;
2447 	case TCP_TW_ACK:
2448 	case TCP_TW_ACK_OOW:
2449 		tcp_v4_timewait_ack(sk, skb, tw_status);
2450 		break;
2451 	case TCP_TW_RST:
2452 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2453 		inet_twsk_deschedule_put(inet_twsk(sk));
2454 		goto discard_it;
2455 	case TCP_TW_SUCCESS:;
2456 	}
2457 	goto discard_it;
2458 }
2459 
2460 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2461 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2462 	.twsk_destructor= tcp_twsk_destructor,
2463 };
2464 
2465 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2466 {
2467 	struct dst_entry *dst = skb_dst(skb);
2468 
2469 	if (dst && dst_hold_safe(dst)) {
2470 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2471 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2472 	}
2473 }
2474 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2475 
2476 const struct inet_connection_sock_af_ops ipv4_specific = {
2477 	.queue_xmit	   = ip_queue_xmit,
2478 	.send_check	   = tcp_v4_send_check,
2479 	.rebuild_header	   = inet_sk_rebuild_header,
2480 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2481 	.conn_request	   = tcp_v4_conn_request,
2482 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2483 	.net_header_len	   = sizeof(struct iphdr),
2484 	.setsockopt	   = ip_setsockopt,
2485 	.getsockopt	   = ip_getsockopt,
2486 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2487 };
2488 EXPORT_IPV6_MOD(ipv4_specific);
2489 
2490 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2491 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2492 #ifdef CONFIG_TCP_MD5SIG
2493 	.md5_lookup		= tcp_v4_md5_lookup,
2494 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2495 	.md5_parse		= tcp_v4_parse_md5_keys,
2496 #endif
2497 #ifdef CONFIG_TCP_AO
2498 	.ao_lookup		= tcp_v4_ao_lookup,
2499 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2500 	.ao_parse		= tcp_v4_parse_ao,
2501 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2502 #endif
2503 };
2504 #endif
2505 
2506 /* NOTE: A lot of things are set to zero explicitly by the call to
2507  *       sk_alloc(), so they need not be done here.
2508  */
2509 static int tcp_v4_init_sock(struct sock *sk)
2510 {
2511 	struct inet_connection_sock *icsk = inet_csk(sk);
2512 
2513 	tcp_init_sock(sk);
2514 
2515 	icsk->icsk_af_ops = &ipv4_specific;
2516 
2517 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2518 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2519 #endif
2520 
2521 	return 0;
2522 }
2523 
2524 #ifdef CONFIG_TCP_MD5SIG
2525 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2526 {
2527 	struct tcp_md5sig_info *md5sig;
2528 
2529 	md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2530 	kfree(md5sig);
2531 	static_branch_slow_dec_deferred(&tcp_md5_needed);
2532 	tcp_md5_release_sigpool();
2533 }
2534 #endif
2535 
2536 static void tcp_release_user_frags(struct sock *sk)
2537 {
2538 #ifdef CONFIG_PAGE_POOL
2539 	unsigned long index;
2540 	void *netmem;
2541 
2542 	xa_for_each(&sk->sk_user_frags, index, netmem)
2543 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2544 #endif
2545 }
2546 
2547 void tcp_v4_destroy_sock(struct sock *sk)
2548 {
2549 	struct tcp_sock *tp = tcp_sk(sk);
2550 
2551 	tcp_release_user_frags(sk);
2552 
2553 	xa_destroy(&sk->sk_user_frags);
2554 
2555 	trace_tcp_destroy_sock(sk);
2556 
2557 	tcp_clear_xmit_timers(sk);
2558 
2559 	tcp_cleanup_congestion_control(sk);
2560 
2561 	tcp_cleanup_ulp(sk);
2562 
2563 	/* Clean up the write buffer. */
2564 	tcp_write_queue_purge(sk);
2565 
2566 	/* Check if we want to disable active TFO */
2567 	tcp_fastopen_active_disable_ofo_check(sk);
2568 
2569 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2570 	skb_rbtree_purge(&tp->out_of_order_queue);
2571 
2572 #ifdef CONFIG_TCP_MD5SIG
2573 	/* Clean up the MD5 key list, if any */
2574 	if (tp->md5sig_info) {
2575 		struct tcp_md5sig_info *md5sig;
2576 
2577 		md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2578 		tcp_clear_md5_list(sk);
2579 		call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2580 		rcu_assign_pointer(tp->md5sig_info, NULL);
2581 	}
2582 #endif
2583 	tcp_ao_destroy_sock(sk, false);
2584 
2585 	/* Clean up a referenced TCP bind bucket. */
2586 	if (inet_csk(sk)->icsk_bind_hash)
2587 		inet_put_port(sk);
2588 
2589 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2590 
2591 	/* If the socket was aborted during the connect operation */
2592 	tcp_free_fastopen_req(tp);
2593 	tcp_fastopen_destroy_cipher(sk);
2594 	tcp_saved_syn_free(tp);
2595 
2596 	sk_sockets_allocated_dec(sk);
2597 }
2598 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2599 
2600 #ifdef CONFIG_PROC_FS
2601 /* Proc filesystem TCP sock list dumping. */
2602 
2603 static unsigned short seq_file_family(const struct seq_file *seq);
2604 
2605 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2606 {
2607 	unsigned short family = seq_file_family(seq);
2608 
2609 	/* AF_UNSPEC is used as a match all */
2610 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2611 		net_eq(sock_net(sk), seq_file_net(seq)));
2612 }
2613 
2614 /* Find a non-empty bucket (starting from st->bucket)
2615  * and return the first sk from it.
2616  */
2617 static void *listening_get_first(struct seq_file *seq)
2618 {
2619 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2620 	struct tcp_iter_state *st = seq->private;
2621 
2622 	st->offset = 0;
2623 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2624 		struct inet_listen_hashbucket *ilb2;
2625 		struct hlist_nulls_node *node;
2626 		struct sock *sk;
2627 
2628 		ilb2 = &hinfo->lhash2[st->bucket];
2629 		if (hlist_nulls_empty(&ilb2->nulls_head))
2630 			continue;
2631 
2632 		spin_lock(&ilb2->lock);
2633 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2634 			if (seq_sk_match(seq, sk))
2635 				return sk;
2636 		}
2637 		spin_unlock(&ilb2->lock);
2638 	}
2639 
2640 	return NULL;
2641 }
2642 
2643 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2644  * If "cur" is the last one in st->bucket,
2645  * call listening_get_first() to return the first sk of the next
2646  * non-empty bucket.
2647  */
2648 static void *listening_get_next(struct seq_file *seq, void *cur)
2649 {
2650 	struct tcp_iter_state *st = seq->private;
2651 	struct inet_listen_hashbucket *ilb2;
2652 	struct hlist_nulls_node *node;
2653 	struct inet_hashinfo *hinfo;
2654 	struct sock *sk = cur;
2655 
2656 	++st->num;
2657 	++st->offset;
2658 
2659 	sk = sk_nulls_next(sk);
2660 	sk_nulls_for_each_from(sk, node) {
2661 		if (seq_sk_match(seq, sk))
2662 			return sk;
2663 	}
2664 
2665 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2666 	ilb2 = &hinfo->lhash2[st->bucket];
2667 	spin_unlock(&ilb2->lock);
2668 	++st->bucket;
2669 	return listening_get_first(seq);
2670 }
2671 
2672 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2673 {
2674 	struct tcp_iter_state *st = seq->private;
2675 	void *rc;
2676 
2677 	st->bucket = 0;
2678 	st->offset = 0;
2679 	rc = listening_get_first(seq);
2680 
2681 	while (rc && *pos) {
2682 		rc = listening_get_next(seq, rc);
2683 		--*pos;
2684 	}
2685 	return rc;
2686 }
2687 
2688 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2689 				const struct tcp_iter_state *st)
2690 {
2691 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2692 }
2693 
2694 /*
2695  * Get first established socket starting from bucket given in st->bucket.
2696  * If st->bucket is zero, the very first socket in the hash is returned.
2697  */
2698 static void *established_get_first(struct seq_file *seq)
2699 {
2700 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2701 	struct tcp_iter_state *st = seq->private;
2702 
2703 	st->offset = 0;
2704 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2705 		struct sock *sk;
2706 		struct hlist_nulls_node *node;
2707 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2708 
2709 		cond_resched();
2710 
2711 		/* Lockless fast path for the common case of empty buckets */
2712 		if (empty_bucket(hinfo, st))
2713 			continue;
2714 
2715 		spin_lock_bh(lock);
2716 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2717 			if (seq_sk_match(seq, sk))
2718 				return sk;
2719 		}
2720 		spin_unlock_bh(lock);
2721 	}
2722 
2723 	return NULL;
2724 }
2725 
2726 static void *established_get_next(struct seq_file *seq, void *cur)
2727 {
2728 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2729 	struct tcp_iter_state *st = seq->private;
2730 	struct hlist_nulls_node *node;
2731 	struct sock *sk = cur;
2732 
2733 	++st->num;
2734 	++st->offset;
2735 
2736 	sk = sk_nulls_next(sk);
2737 
2738 	sk_nulls_for_each_from(sk, node) {
2739 		if (seq_sk_match(seq, sk))
2740 			return sk;
2741 	}
2742 
2743 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2744 	++st->bucket;
2745 	return established_get_first(seq);
2746 }
2747 
2748 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2749 {
2750 	struct tcp_iter_state *st = seq->private;
2751 	void *rc;
2752 
2753 	st->bucket = 0;
2754 	rc = established_get_first(seq);
2755 
2756 	while (rc && pos) {
2757 		rc = established_get_next(seq, rc);
2758 		--pos;
2759 	}
2760 	return rc;
2761 }
2762 
2763 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2764 {
2765 	void *rc;
2766 	struct tcp_iter_state *st = seq->private;
2767 
2768 	st->state = TCP_SEQ_STATE_LISTENING;
2769 	rc	  = listening_get_idx(seq, &pos);
2770 
2771 	if (!rc) {
2772 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2773 		rc	  = established_get_idx(seq, pos);
2774 	}
2775 
2776 	return rc;
2777 }
2778 
2779 static void *tcp_seek_last_pos(struct seq_file *seq)
2780 {
2781 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2782 	struct tcp_iter_state *st = seq->private;
2783 	int bucket = st->bucket;
2784 	int offset = st->offset;
2785 	int orig_num = st->num;
2786 	void *rc = NULL;
2787 
2788 	switch (st->state) {
2789 	case TCP_SEQ_STATE_LISTENING:
2790 		if (st->bucket > hinfo->lhash2_mask)
2791 			break;
2792 		rc = listening_get_first(seq);
2793 		while (offset-- && rc && bucket == st->bucket)
2794 			rc = listening_get_next(seq, rc);
2795 		if (rc)
2796 			break;
2797 		st->bucket = 0;
2798 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2799 		fallthrough;
2800 	case TCP_SEQ_STATE_ESTABLISHED:
2801 		if (st->bucket > hinfo->ehash_mask)
2802 			break;
2803 		rc = established_get_first(seq);
2804 		while (offset-- && rc && bucket == st->bucket)
2805 			rc = established_get_next(seq, rc);
2806 	}
2807 
2808 	st->num = orig_num;
2809 
2810 	return rc;
2811 }
2812 
2813 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2814 {
2815 	struct tcp_iter_state *st = seq->private;
2816 	void *rc;
2817 
2818 	if (*pos && *pos == st->last_pos) {
2819 		rc = tcp_seek_last_pos(seq);
2820 		if (rc)
2821 			goto out;
2822 	}
2823 
2824 	st->state = TCP_SEQ_STATE_LISTENING;
2825 	st->num = 0;
2826 	st->bucket = 0;
2827 	st->offset = 0;
2828 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2829 
2830 out:
2831 	st->last_pos = *pos;
2832 	return rc;
2833 }
2834 EXPORT_IPV6_MOD(tcp_seq_start);
2835 
2836 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2837 {
2838 	struct tcp_iter_state *st = seq->private;
2839 	void *rc = NULL;
2840 
2841 	if (v == SEQ_START_TOKEN) {
2842 		rc = tcp_get_idx(seq, 0);
2843 		goto out;
2844 	}
2845 
2846 	switch (st->state) {
2847 	case TCP_SEQ_STATE_LISTENING:
2848 		rc = listening_get_next(seq, v);
2849 		if (!rc) {
2850 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2851 			st->bucket = 0;
2852 			st->offset = 0;
2853 			rc	  = established_get_first(seq);
2854 		}
2855 		break;
2856 	case TCP_SEQ_STATE_ESTABLISHED:
2857 		rc = established_get_next(seq, v);
2858 		break;
2859 	}
2860 out:
2861 	++*pos;
2862 	st->last_pos = *pos;
2863 	return rc;
2864 }
2865 EXPORT_IPV6_MOD(tcp_seq_next);
2866 
2867 void tcp_seq_stop(struct seq_file *seq, void *v)
2868 {
2869 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2870 	struct tcp_iter_state *st = seq->private;
2871 
2872 	switch (st->state) {
2873 	case TCP_SEQ_STATE_LISTENING:
2874 		if (v != SEQ_START_TOKEN)
2875 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2876 		break;
2877 	case TCP_SEQ_STATE_ESTABLISHED:
2878 		if (v)
2879 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2880 		break;
2881 	}
2882 }
2883 EXPORT_IPV6_MOD(tcp_seq_stop);
2884 
2885 static void get_openreq4(const struct request_sock *req,
2886 			 struct seq_file *f, int i)
2887 {
2888 	const struct inet_request_sock *ireq = inet_rsk(req);
2889 	long delta = req->rsk_timer.expires - jiffies;
2890 
2891 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2892 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2893 		i,
2894 		ireq->ir_loc_addr,
2895 		ireq->ir_num,
2896 		ireq->ir_rmt_addr,
2897 		ntohs(ireq->ir_rmt_port),
2898 		TCP_SYN_RECV,
2899 		0, 0, /* could print option size, but that is af dependent. */
2900 		1,    /* timers active (only the expire timer) */
2901 		jiffies_delta_to_clock_t(delta),
2902 		req->num_timeout,
2903 		from_kuid_munged(seq_user_ns(f),
2904 				 sk_uid(req->rsk_listener)),
2905 		0,  /* non standard timer */
2906 		0, /* open_requests have no inode */
2907 		0,
2908 		req);
2909 }
2910 
2911 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2912 {
2913 	int timer_active;
2914 	unsigned long timer_expires;
2915 	const struct tcp_sock *tp = tcp_sk(sk);
2916 	const struct inet_connection_sock *icsk = inet_csk(sk);
2917 	const struct inet_sock *inet = inet_sk(sk);
2918 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2919 	__be32 dest = inet->inet_daddr;
2920 	__be32 src = inet->inet_rcv_saddr;
2921 	__u16 destp = ntohs(inet->inet_dport);
2922 	__u16 srcp = ntohs(inet->inet_sport);
2923 	u8 icsk_pending;
2924 	int rx_queue;
2925 	int state;
2926 
2927 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2928 	if (icsk_pending == ICSK_TIME_RETRANS ||
2929 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2930 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2931 		timer_active	= 1;
2932 		timer_expires	= icsk_timeout(icsk);
2933 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2934 		timer_active	= 4;
2935 		timer_expires	= icsk_timeout(icsk);
2936 	} else if (timer_pending(&sk->sk_timer)) {
2937 		timer_active	= 2;
2938 		timer_expires	= sk->sk_timer.expires;
2939 	} else {
2940 		timer_active	= 0;
2941 		timer_expires = jiffies;
2942 	}
2943 
2944 	state = inet_sk_state_load(sk);
2945 	if (state == TCP_LISTEN)
2946 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2947 	else
2948 		/* Because we don't lock the socket,
2949 		 * we might find a transient negative value.
2950 		 */
2951 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2952 				      READ_ONCE(tp->copied_seq), 0);
2953 
2954 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2955 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2956 		i, src, srcp, dest, destp, state,
2957 		READ_ONCE(tp->write_seq) - tp->snd_una,
2958 		rx_queue,
2959 		timer_active,
2960 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2961 		icsk->icsk_retransmits,
2962 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2963 		icsk->icsk_probes_out,
2964 		sock_i_ino(sk),
2965 		refcount_read(&sk->sk_refcnt), sk,
2966 		jiffies_to_clock_t(icsk->icsk_rto),
2967 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2968 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2969 		tcp_snd_cwnd(tp),
2970 		state == TCP_LISTEN ?
2971 		    fastopenq->max_qlen :
2972 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2973 }
2974 
2975 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2976 			       struct seq_file *f, int i)
2977 {
2978 	long delta = tw->tw_timer.expires - jiffies;
2979 	__be32 dest, src;
2980 	__u16 destp, srcp;
2981 
2982 	dest  = tw->tw_daddr;
2983 	src   = tw->tw_rcv_saddr;
2984 	destp = ntohs(tw->tw_dport);
2985 	srcp  = ntohs(tw->tw_sport);
2986 
2987 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2988 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2989 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2990 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2991 		refcount_read(&tw->tw_refcnt), tw);
2992 }
2993 
2994 #define TMPSZ 150
2995 
2996 static int tcp4_seq_show(struct seq_file *seq, void *v)
2997 {
2998 	struct tcp_iter_state *st;
2999 	struct sock *sk = v;
3000 
3001 	seq_setwidth(seq, TMPSZ - 1);
3002 	if (v == SEQ_START_TOKEN) {
3003 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
3004 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
3005 			   "inode");
3006 		goto out;
3007 	}
3008 	st = seq->private;
3009 
3010 	if (sk->sk_state == TCP_TIME_WAIT)
3011 		get_timewait4_sock(v, seq, st->num);
3012 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
3013 		get_openreq4(v, seq, st->num);
3014 	else
3015 		get_tcp4_sock(v, seq, st->num);
3016 out:
3017 	seq_pad(seq, '\n');
3018 	return 0;
3019 }
3020 
3021 #ifdef CONFIG_BPF_SYSCALL
3022 union bpf_tcp_iter_batch_item {
3023 	struct sock *sk;
3024 	__u64 cookie;
3025 };
3026 
3027 struct bpf_tcp_iter_state {
3028 	struct tcp_iter_state state;
3029 	unsigned int cur_sk;
3030 	unsigned int end_sk;
3031 	unsigned int max_sk;
3032 	union bpf_tcp_iter_batch_item *batch;
3033 };
3034 
3035 struct bpf_iter__tcp {
3036 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3037 	__bpf_md_ptr(struct sock_common *, sk_common);
3038 	uid_t uid __aligned(8);
3039 };
3040 
3041 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3042 			     struct sock_common *sk_common, uid_t uid)
3043 {
3044 	struct bpf_iter__tcp ctx;
3045 
3046 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3047 	ctx.meta = meta;
3048 	ctx.sk_common = sk_common;
3049 	ctx.uid = uid;
3050 	return bpf_iter_run_prog(prog, &ctx);
3051 }
3052 
3053 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3054 {
3055 	union bpf_tcp_iter_batch_item *item;
3056 	unsigned int cur_sk = iter->cur_sk;
3057 	__u64 cookie;
3058 
3059 	/* Remember the cookies of the sockets we haven't seen yet, so we can
3060 	 * pick up where we left off next time around.
3061 	 */
3062 	while (cur_sk < iter->end_sk) {
3063 		item = &iter->batch[cur_sk++];
3064 		cookie = sock_gen_cookie(item->sk);
3065 		sock_gen_put(item->sk);
3066 		item->cookie = cookie;
3067 	}
3068 }
3069 
3070 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3071 				      unsigned int new_batch_sz, gfp_t flags)
3072 {
3073 	union bpf_tcp_iter_batch_item *new_batch;
3074 
3075 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3076 			     flags | __GFP_NOWARN);
3077 	if (!new_batch)
3078 		return -ENOMEM;
3079 
3080 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3081 	kvfree(iter->batch);
3082 	iter->batch = new_batch;
3083 	iter->max_sk = new_batch_sz;
3084 
3085 	return 0;
3086 }
3087 
3088 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3089 					       union bpf_tcp_iter_batch_item *cookies,
3090 					       int n_cookies)
3091 {
3092 	struct hlist_nulls_node *node;
3093 	struct sock *sk;
3094 	int i;
3095 
3096 	for (i = 0; i < n_cookies; i++) {
3097 		sk = first_sk;
3098 		sk_nulls_for_each_from(sk, node)
3099 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3100 				return sk;
3101 	}
3102 
3103 	return NULL;
3104 }
3105 
3106 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3107 {
3108 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3109 	struct bpf_tcp_iter_state *iter = seq->private;
3110 	struct tcp_iter_state *st = &iter->state;
3111 	unsigned int find_cookie = iter->cur_sk;
3112 	unsigned int end_cookie = iter->end_sk;
3113 	int resume_bucket = st->bucket;
3114 	struct sock *sk;
3115 
3116 	if (end_cookie && find_cookie == end_cookie)
3117 		++st->bucket;
3118 
3119 	sk = listening_get_first(seq);
3120 	iter->cur_sk = 0;
3121 	iter->end_sk = 0;
3122 
3123 	if (sk && st->bucket == resume_bucket && end_cookie) {
3124 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3125 						end_cookie - find_cookie);
3126 		if (!sk) {
3127 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
3128 			++st->bucket;
3129 			sk = listening_get_first(seq);
3130 		}
3131 	}
3132 
3133 	return sk;
3134 }
3135 
3136 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3137 {
3138 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3139 	struct bpf_tcp_iter_state *iter = seq->private;
3140 	struct tcp_iter_state *st = &iter->state;
3141 	unsigned int find_cookie = iter->cur_sk;
3142 	unsigned int end_cookie = iter->end_sk;
3143 	int resume_bucket = st->bucket;
3144 	struct sock *sk;
3145 
3146 	if (end_cookie && find_cookie == end_cookie)
3147 		++st->bucket;
3148 
3149 	sk = established_get_first(seq);
3150 	iter->cur_sk = 0;
3151 	iter->end_sk = 0;
3152 
3153 	if (sk && st->bucket == resume_bucket && end_cookie) {
3154 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3155 						end_cookie - find_cookie);
3156 		if (!sk) {
3157 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3158 			++st->bucket;
3159 			sk = established_get_first(seq);
3160 		}
3161 	}
3162 
3163 	return sk;
3164 }
3165 
3166 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3167 {
3168 	struct bpf_tcp_iter_state *iter = seq->private;
3169 	struct tcp_iter_state *st = &iter->state;
3170 	struct sock *sk = NULL;
3171 
3172 	switch (st->state) {
3173 	case TCP_SEQ_STATE_LISTENING:
3174 		sk = bpf_iter_tcp_resume_listening(seq);
3175 		if (sk)
3176 			break;
3177 		st->bucket = 0;
3178 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3179 		fallthrough;
3180 	case TCP_SEQ_STATE_ESTABLISHED:
3181 		sk = bpf_iter_tcp_resume_established(seq);
3182 		break;
3183 	}
3184 
3185 	return sk;
3186 }
3187 
3188 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3189 						 struct sock **start_sk)
3190 {
3191 	struct bpf_tcp_iter_state *iter = seq->private;
3192 	struct hlist_nulls_node *node;
3193 	unsigned int expected = 1;
3194 	struct sock *sk;
3195 
3196 	sock_hold(*start_sk);
3197 	iter->batch[iter->end_sk++].sk = *start_sk;
3198 
3199 	sk = sk_nulls_next(*start_sk);
3200 	*start_sk = NULL;
3201 	sk_nulls_for_each_from(sk, node) {
3202 		if (seq_sk_match(seq, sk)) {
3203 			if (iter->end_sk < iter->max_sk) {
3204 				sock_hold(sk);
3205 				iter->batch[iter->end_sk++].sk = sk;
3206 			} else if (!*start_sk) {
3207 				/* Remember where we left off. */
3208 				*start_sk = sk;
3209 			}
3210 			expected++;
3211 		}
3212 	}
3213 
3214 	return expected;
3215 }
3216 
3217 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3218 						   struct sock **start_sk)
3219 {
3220 	struct bpf_tcp_iter_state *iter = seq->private;
3221 	struct hlist_nulls_node *node;
3222 	unsigned int expected = 1;
3223 	struct sock *sk;
3224 
3225 	sock_hold(*start_sk);
3226 	iter->batch[iter->end_sk++].sk = *start_sk;
3227 
3228 	sk = sk_nulls_next(*start_sk);
3229 	*start_sk = NULL;
3230 	sk_nulls_for_each_from(sk, node) {
3231 		if (seq_sk_match(seq, sk)) {
3232 			if (iter->end_sk < iter->max_sk) {
3233 				sock_hold(sk);
3234 				iter->batch[iter->end_sk++].sk = sk;
3235 			} else if (!*start_sk) {
3236 				/* Remember where we left off. */
3237 				*start_sk = sk;
3238 			}
3239 			expected++;
3240 		}
3241 	}
3242 
3243 	return expected;
3244 }
3245 
3246 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3247 					struct sock **start_sk)
3248 {
3249 	struct bpf_tcp_iter_state *iter = seq->private;
3250 	struct tcp_iter_state *st = &iter->state;
3251 
3252 	if (st->state == TCP_SEQ_STATE_LISTENING)
3253 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3254 	else
3255 		return bpf_iter_tcp_established_batch(seq, start_sk);
3256 }
3257 
3258 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3259 {
3260 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3261 	struct bpf_tcp_iter_state *iter = seq->private;
3262 	struct tcp_iter_state *st = &iter->state;
3263 
3264 	if (st->state == TCP_SEQ_STATE_LISTENING)
3265 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3266 	else
3267 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3268 }
3269 
3270 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3271 {
3272 	struct bpf_tcp_iter_state *iter = seq->private;
3273 	unsigned int expected;
3274 	struct sock *sk;
3275 	int err;
3276 
3277 	sk = bpf_iter_tcp_resume(seq);
3278 	if (!sk)
3279 		return NULL; /* Done */
3280 
3281 	expected = bpf_iter_fill_batch(seq, &sk);
3282 	if (likely(iter->end_sk == expected))
3283 		goto done;
3284 
3285 	/* Batch size was too small. */
3286 	bpf_iter_tcp_unlock_bucket(seq);
3287 	bpf_iter_tcp_put_batch(iter);
3288 	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3289 					 GFP_USER);
3290 	if (err)
3291 		return ERR_PTR(err);
3292 
3293 	sk = bpf_iter_tcp_resume(seq);
3294 	if (!sk)
3295 		return NULL; /* Done */
3296 
3297 	expected = bpf_iter_fill_batch(seq, &sk);
3298 	if (likely(iter->end_sk == expected))
3299 		goto done;
3300 
3301 	/* Batch size was still too small. Hold onto the lock while we try
3302 	 * again with a larger batch to make sure the current bucket's size
3303 	 * does not change in the meantime.
3304 	 */
3305 	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3306 	if (err) {
3307 		bpf_iter_tcp_unlock_bucket(seq);
3308 		return ERR_PTR(err);
3309 	}
3310 
3311 	expected = bpf_iter_fill_batch(seq, &sk);
3312 	WARN_ON_ONCE(iter->end_sk != expected);
3313 done:
3314 	bpf_iter_tcp_unlock_bucket(seq);
3315 	return iter->batch[0].sk;
3316 }
3317 
3318 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3319 {
3320 	/* bpf iter does not support lseek, so it always
3321 	 * continues from where it was stop()-ped.
3322 	 */
3323 	if (*pos)
3324 		return bpf_iter_tcp_batch(seq);
3325 
3326 	return SEQ_START_TOKEN;
3327 }
3328 
3329 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3330 {
3331 	struct bpf_tcp_iter_state *iter = seq->private;
3332 	struct tcp_iter_state *st = &iter->state;
3333 	struct sock *sk;
3334 
3335 	/* Whenever seq_next() is called, the sk at iter->cur_sk is
3336 	 * done with seq_show(), so advance to the next sk in
3337 	 * the batch.
3338 	 */
3339 	if (iter->cur_sk < iter->end_sk) {
3340 		/* Keeping st->num consistent in tcp_iter_state.
3341 		 * bpf_iter_tcp does not use st->num.
3342 		 * meta.seq_num is used instead.
3343 		 */
3344 		st->num++;
3345 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3346 	}
3347 
3348 	if (iter->cur_sk < iter->end_sk)
3349 		sk = iter->batch[iter->cur_sk].sk;
3350 	else
3351 		sk = bpf_iter_tcp_batch(seq);
3352 
3353 	++*pos;
3354 	/* Keeping st->last_pos consistent in tcp_iter_state.
3355 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3356 	 */
3357 	st->last_pos = *pos;
3358 	return sk;
3359 }
3360 
3361 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3362 {
3363 	struct bpf_iter_meta meta;
3364 	struct bpf_prog *prog;
3365 	struct sock *sk = v;
3366 	uid_t uid;
3367 	int ret;
3368 
3369 	if (v == SEQ_START_TOKEN)
3370 		return 0;
3371 
3372 	if (sk_fullsock(sk))
3373 		lock_sock(sk);
3374 
3375 	if (unlikely(sk_unhashed(sk))) {
3376 		ret = SEQ_SKIP;
3377 		goto unlock;
3378 	}
3379 
3380 	if (sk->sk_state == TCP_TIME_WAIT) {
3381 		uid = 0;
3382 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3383 		const struct request_sock *req = v;
3384 
3385 		uid = from_kuid_munged(seq_user_ns(seq),
3386 				       sk_uid(req->rsk_listener));
3387 	} else {
3388 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3389 	}
3390 
3391 	meta.seq = seq;
3392 	prog = bpf_iter_get_info(&meta, false);
3393 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3394 
3395 unlock:
3396 	if (sk_fullsock(sk))
3397 		release_sock(sk);
3398 	return ret;
3399 
3400 }
3401 
3402 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3403 {
3404 	struct bpf_tcp_iter_state *iter = seq->private;
3405 	struct bpf_iter_meta meta;
3406 	struct bpf_prog *prog;
3407 
3408 	if (!v) {
3409 		meta.seq = seq;
3410 		prog = bpf_iter_get_info(&meta, true);
3411 		if (prog)
3412 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3413 	}
3414 
3415 	if (iter->cur_sk < iter->end_sk)
3416 		bpf_iter_tcp_put_batch(iter);
3417 }
3418 
3419 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3420 	.show		= bpf_iter_tcp_seq_show,
3421 	.start		= bpf_iter_tcp_seq_start,
3422 	.next		= bpf_iter_tcp_seq_next,
3423 	.stop		= bpf_iter_tcp_seq_stop,
3424 };
3425 #endif
3426 static unsigned short seq_file_family(const struct seq_file *seq)
3427 {
3428 	const struct tcp_seq_afinfo *afinfo;
3429 
3430 #ifdef CONFIG_BPF_SYSCALL
3431 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3432 	if (seq->op == &bpf_iter_tcp_seq_ops)
3433 		return AF_UNSPEC;
3434 #endif
3435 
3436 	/* Iterated from proc fs */
3437 	afinfo = pde_data(file_inode(seq->file));
3438 	return afinfo->family;
3439 }
3440 
3441 static const struct seq_operations tcp4_seq_ops = {
3442 	.show		= tcp4_seq_show,
3443 	.start		= tcp_seq_start,
3444 	.next		= tcp_seq_next,
3445 	.stop		= tcp_seq_stop,
3446 };
3447 
3448 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3449 	.family		= AF_INET,
3450 };
3451 
3452 static int __net_init tcp4_proc_init_net(struct net *net)
3453 {
3454 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3455 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3456 		return -ENOMEM;
3457 	return 0;
3458 }
3459 
3460 static void __net_exit tcp4_proc_exit_net(struct net *net)
3461 {
3462 	remove_proc_entry("tcp", net->proc_net);
3463 }
3464 
3465 static struct pernet_operations tcp4_net_ops = {
3466 	.init = tcp4_proc_init_net,
3467 	.exit = tcp4_proc_exit_net,
3468 };
3469 
3470 int __init tcp4_proc_init(void)
3471 {
3472 	return register_pernet_subsys(&tcp4_net_ops);
3473 }
3474 
3475 void tcp4_proc_exit(void)
3476 {
3477 	unregister_pernet_subsys(&tcp4_net_ops);
3478 }
3479 #endif /* CONFIG_PROC_FS */
3480 
3481 /* @wake is one when sk_stream_write_space() calls us.
3482  * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3483  * This mimics the strategy used in sock_def_write_space().
3484  */
3485 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3486 {
3487 	const struct tcp_sock *tp = tcp_sk(sk);
3488 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3489 			    READ_ONCE(tp->snd_nxt);
3490 
3491 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3492 }
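/* Editor's illustration (assumed values, not from the source): with
 * tcp_notsent_lowat(tp) == 128 KB, a plain check (wake == 0) treats the
 * stream as writable while notsent_bytes < 128 KB, whereas a wakeup from
 * sk_stream_write_space() (wake == 1) reports memory free only once
 * notsent_bytes has drained below 64 KB, mirroring the half-the-limit
 * strategy noted in the comment above.
 */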
3493 EXPORT_SYMBOL(tcp_stream_memory_free);
3494 
3495 struct proto tcp_prot = {
3496 	.name			= "TCP",
3497 	.owner			= THIS_MODULE,
3498 	.close			= tcp_close,
3499 	.pre_connect		= tcp_v4_pre_connect,
3500 	.connect		= tcp_v4_connect,
3501 	.disconnect		= tcp_disconnect,
3502 	.accept			= inet_csk_accept,
3503 	.ioctl			= tcp_ioctl,
3504 	.init			= tcp_v4_init_sock,
3505 	.destroy		= tcp_v4_destroy_sock,
3506 	.shutdown		= tcp_shutdown,
3507 	.setsockopt		= tcp_setsockopt,
3508 	.getsockopt		= tcp_getsockopt,
3509 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3510 	.keepalive		= tcp_set_keepalive,
3511 	.recvmsg		= tcp_recvmsg,
3512 	.sendmsg		= tcp_sendmsg,
3513 	.splice_eof		= tcp_splice_eof,
3514 	.backlog_rcv		= tcp_v4_do_rcv,
3515 	.release_cb		= tcp_release_cb,
3516 	.hash			= inet_hash,
3517 	.unhash			= inet_unhash,
3518 	.get_port		= inet_csk_get_port,
3519 	.put_port		= inet_put_port,
3520 #ifdef CONFIG_BPF_SYSCALL
3521 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3522 #endif
3523 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3524 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3525 	.stream_memory_free	= tcp_stream_memory_free,
3526 	.sockets_allocated	= &tcp_sockets_allocated,
3527 	.orphan_count		= &tcp_orphan_count,
3528 
3529 	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
3530 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
3531 
3532 	.memory_pressure	= &tcp_memory_pressure,
3533 	.sysctl_mem		= sysctl_tcp_mem,
3534 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3535 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3536 	.max_header		= MAX_TCP_HEADER,
3537 	.obj_size		= sizeof(struct tcp_sock),
3538 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3539 	.twsk_prot		= &tcp_timewait_sock_ops,
3540 	.rsk_prot		= &tcp_request_sock_ops,
3541 	.h.hashinfo		= NULL,
3542 	.no_autobind		= true,
3543 	.diag_destroy		= tcp_abort,
3544 };
3545 EXPORT_SYMBOL(tcp_prot);
3546 
3547 static void __net_exit tcp_sk_exit(struct net *net)
3548 {
3549 	if (net->ipv4.tcp_congestion_control)
3550 		bpf_module_put(net->ipv4.tcp_congestion_control,
3551 			       net->ipv4.tcp_congestion_control->owner);
3552 }
3553 
3554 static void __net_init tcp_set_hashinfo(struct net *net)
3555 {
3556 	struct inet_hashinfo *hinfo;
3557 	unsigned int ehash_entries;
3558 	struct net *old_net;
3559 
3560 	if (net_eq(net, &init_net))
3561 		goto fallback;
3562 
3563 	old_net = current->nsproxy->net_ns;
3564 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3565 	if (!ehash_entries)
3566 		goto fallback;
3567 
3568 	ehash_entries = roundup_pow_of_two(ehash_entries);
3569 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3570 	if (!hinfo) {
3571 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3572 			"for a netns, fallback to the global one\n",
3573 			ehash_entries);
3574 fallback:
3575 		hinfo = &tcp_hashinfo;
3576 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3577 	}
3578 
3579 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3580 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3581 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3582 }
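/* Editor's illustration (hypothetical sysctl value): if the creating netns
 * sets sysctl_tcp_child_ehash_entries = 1000, the child netns gets a
 * 1024-entry ehash (rounded up to a power of two), sysctl_max_tw_buckets =
 * 512 and sysctl_max_syn_backlog = max(128, 1024 / 128) = 128.
 */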
3583 
3584 static int __net_init tcp_sk_init(struct net *net)
3585 {
3586 	net->ipv4.sysctl_tcp_ecn = 2;
3587 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3588 
3589 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3590 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3591 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3592 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3593 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3594 
3595 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3596 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3597 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3598 
3599 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3600 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3601 	net->ipv4.sysctl_tcp_syncookies = 1;
3602 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3603 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3604 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3605 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3606 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
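	/* UINT_MAX effectively disables the notsent_lowat check in
	 * tcp_stream_memory_free() unless overridden per socket with
	 * TCP_NOTSENT_LOWAT.
	 */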
3607 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
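	/* 2: reuse TIME-WAIT sockets only for connections to loopback
	 * addresses.
	 */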
3608 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3609 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3610 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3611 
3612 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3613 	tcp_set_hashinfo(net);
3614 
3615 	net->ipv4.sysctl_tcp_sack = 1;
3616 	net->ipv4.sysctl_tcp_window_scaling = 1;
3617 	net->ipv4.sysctl_tcp_timestamps = 1;
3618 	net->ipv4.sysctl_tcp_early_retrans = 3;
3619 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3620 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3621 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3622 	net->ipv4.sysctl_tcp_max_reordering = 300;
3623 	net->ipv4.sysctl_tcp_dsack = 1;
3624 	net->ipv4.sysctl_tcp_app_win = 31;
3625 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3626 	net->ipv4.sysctl_tcp_frto = 2;
3627 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3628 	/* This limits the percentage of the congestion window which we
3629 	 * will allow a single TSO frame to consume.  Building TSO frames
3630 	 * which are too large can cause TCP streams to be bursty.
3631 	 */
3632 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3633 	/* Default TSQ limit of 4 MB */
3634 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3635 
3636 	/* RFC 5961 challenge ACK rate limiting, per net-ns, disabled by default. */
3637 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3638 
3639 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3640 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3641 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3642 	net->ipv4.sysctl_tcp_autocorking = 1;
3643 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3644 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3645 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3646 	if (net != &init_net) {
3647 		memcpy(net->ipv4.sysctl_tcp_rmem,
3648 		       init_net.ipv4.sysctl_tcp_rmem,
3649 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3650 		memcpy(net->ipv4.sysctl_tcp_wmem,
3651 		       init_net.ipv4.sysctl_tcp_wmem,
3652 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3653 	}
3654 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3655 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3656 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3657 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3658 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3659 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3660 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3661 
3662 	/* Set default values for PLB */
3663 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3664 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3665 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3666 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3667 	/* Default congestion threshold for PLB to mark a round is 50% */
3668 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3669 
3670 	/* Reno is always built in */
3671 	if (!net_eq(net, &init_net) &&
3672 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3673 			       init_net.ipv4.tcp_congestion_control->owner))
3674 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3675 	else
3676 		net->ipv4.tcp_congestion_control = &tcp_reno;
3677 
3678 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3679 	net->ipv4.sysctl_tcp_shrink_window = 0;
3680 
3681 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3682 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3683 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3684 
3685 	return 0;
3686 }
3687 
3688 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3689 {
3690 	struct net *net;
3691 
3692 	/* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3693 	 * and from the failed setup_net() error unwinding path are serialized.
3694 	 *
3695 	 * Since tcp_twsk_purge() handles twsk in any dead netns, not just those
3696 	 * on net_exit_list, the thread dismantling a particular twsk must do so
3697 	 * before any other thread can progress to refcount_dec_and_test() of
3698 	 * tcp_death_row.tw_refcount.
3699 	 */
3700 	mutex_lock(&tcp_exit_batch_mutex);
3701 
3702 	tcp_twsk_purge(net_exit_list);
3703 
3704 	list_for_each_entry(net, net_exit_list, exit_list) {
3705 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3706 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3707 		tcp_fastopen_ctx_destroy(net);
3708 	}
3709 
3710 	mutex_unlock(&tcp_exit_batch_mutex);
3711 }
3712 
3713 static struct pernet_operations __net_initdata tcp_sk_ops = {
3714 	.init		= tcp_sk_init,
3715 	.exit		= tcp_sk_exit,
3716 	.exit_batch	= tcp_sk_exit_batch,
3717 };
3718 
3719 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3720 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3721 		     struct sock_common *sk_common, uid_t uid)
3722 
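/* Initial capacity of the socket batch used by the BPF TCP iterator;
 * bpf_iter_tcp_realloc_batch() grows it on demand while iterating.
 */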
3723 #define INIT_BATCH_SZ 16
3724 
3725 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3726 {
3727 	struct bpf_tcp_iter_state *iter = priv_data;
3728 	int err;
3729 
3730 	err = bpf_iter_init_seq_net(priv_data, aux);
3731 	if (err)
3732 		return err;
3733 
3734 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3735 	if (err) {
3736 		bpf_iter_fini_seq_net(priv_data);
3737 		return err;
3738 	}
3739 
3740 	return 0;
3741 }
3742 
3743 static void bpf_iter_fini_tcp(void *priv_data)
3744 {
3745 	struct bpf_tcp_iter_state *iter = priv_data;
3746 
3747 	bpf_iter_fini_seq_net(priv_data);
3748 	kvfree(iter->batch);
3749 }
3750 
3751 static const struct bpf_iter_seq_info tcp_seq_info = {
3752 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3753 	.init_seq_private	= bpf_iter_init_tcp,
3754 	.fini_seq_private	= bpf_iter_fini_tcp,
3755 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3756 };
3757 
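/* Let BPF TCP iterator programs call bpf_setsockopt()/bpf_getsockopt()
 * on the sockets they visit.
 */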
3758 static const struct bpf_func_proto *
3759 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3760 			    const struct bpf_prog *prog)
3761 {
3762 	switch (func_id) {
3763 	case BPF_FUNC_setsockopt:
3764 		return &bpf_sk_setsockopt_proto;
3765 	case BPF_FUNC_getsockopt:
3766 		return &bpf_sk_getsockopt_proto;
3767 	default:
3768 		return NULL;
3769 	}
3770 }
3771 
3772 static struct bpf_iter_reg tcp_reg_info = {
3773 	.target			= "tcp",
3774 	.ctx_arg_info_size	= 1,
3775 	.ctx_arg_info		= {
3776 		{ offsetof(struct bpf_iter__tcp, sk_common),
3777 		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3778 	},
3779 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3780 	.seq_info		= &tcp_seq_info,
3781 };
3782 
3783 static void __init bpf_iter_register(void)
3784 {
3785 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3786 	if (bpf_iter_reg_target(&tcp_reg_info))
3787 		pr_warn("Warning: could not register bpf iterator tcp\n");
3788 }
3789 
3790 #endif
3791 
3792 void __init tcp_v4_init(void)
3793 {
3794 	int cpu, res;
3795 
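	/* One kernel control socket per possible CPU; these back the
	 * stateless RSTs and ACKs sent on behalf of SYN-RECV and
	 * TIME-WAIT sockets.
	 */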
3796 	for_each_possible_cpu(cpu) {
3797 		struct sock *sk;
3798 
3799 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3800 					   IPPROTO_TCP, &init_net);
3801 		if (res)
3802 			panic("Failed to create the TCP control socket.\n");
3803 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3804 
3805 		/* Setting IP_PMTUDISC_DO enforces IP_DF (and thus IPID == 0)
3806 		 * on RST and ACK packets sent from SYN-RECV and TIME-WAIT.
3807 		 */
3808 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3809 
3810 		sk->sk_clockid = CLOCK_MONOTONIC;
3811 
3812 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3813 	}
3814 	if (register_pernet_subsys(&tcp_sk_ops))
3815 		panic("Failed to register the TCP pernet operations.\n");
3816 
3817 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3818 	bpf_iter_register();
3819 #endif
3820 }
3821