xref: /linux/net/ipv4/tcp_ipv4.c (revision e3b9626f09d429788d929c9b9000a069fcfc056e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after a year-long
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
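/* Initial sequence number and timestamp offset for a passive connection,
 * both derived from the incoming packet's addresses (and, for the ISN,
 * ports) via the keyed hashes in secure_tcp_seq()/secure_tcp_ts_off().
 */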
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
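/* Decide whether a TIME-WAIT socket occupying the wanted 4-tuple may be
 * reused for a new outgoing connection (see the tcp_tw_reuse sysctl).
 * Returns 1 and takes a reference on sktw when reuse is allowed, 0 otherwise.
 */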
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's, except that the timestamp cache
145 	   is held not per host but per port pair, and the TW bucket is used as
146 	   the state holder.
147 
148 	   If the TW bucket has already been destroyed we fall back to VJ's
149 	   scheme and use the initial timestamp retrieved from the peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent the BPF program called below from accessing bytes that are
187 	 * outside the bound specified by the user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and, without releasing the socket
271 	 * lock, select a source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if the socket was owned by the
335  * user at the time tcp_v4_err() was called to handle the ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = tcp_sk(sk)->mtu_info;
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to go wrong... Remember the soft error
351 	 * in case this connection is not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
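/* Propagate an ICMP redirect to the route cached on the socket, if any. */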
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of the PMTU discovery (RFC1191) special case:
502 	 * we can receive locally generated ICMP messages while the socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs sent out by Linux are always <576 bytes so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			tp->mtu_info = info;
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* check if this ICMP message allows revert of backoff.
561 		 * (see RFC 6069)
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk->sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * RFC 1122 4.2.3.9 allows us to consider as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by pmtu discovery).
603 	 *
604 	 * Note that in the modern internet, where routing is unreliable
605 	 * and broken firewalls sit in every dark corner sending random
606 	 * errors ordered by their masters, even these two messages finally lose
607 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk->sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
649  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
650  *		      for the reset.
651  *	Answer: if a packet caused an RST, it is not for a socket
652  *		existing in our system; if it is matched to a socket,
653  *		it is just a duplicate segment or a bug in the other side's TCP.
654  *		So we build the reply based only on the parameters that
655  *		arrived with the segment.
656  */
657 
658 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
659 {
660 	const struct tcphdr *th = tcp_hdr(skb);
661 	struct {
662 		struct tcphdr th;
663 #ifdef CONFIG_TCP_MD5SIG
664 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
665 #endif
666 	} rep;
667 	struct ip_reply_arg arg;
668 #ifdef CONFIG_TCP_MD5SIG
669 	struct tcp_md5sig_key *key = NULL;
670 	const __u8 *hash_location = NULL;
671 	unsigned char newhash[16];
672 	int genhash;
673 	struct sock *sk1 = NULL;
674 #endif
675 	u64 transmit_time = 0;
676 	struct sock *ctl_sk;
677 	struct net *net;
678 
679 	/* Never send a reset in response to a reset. */
680 	if (th->rst)
681 		return;
682 
683 	/* If sk is not NULL, it means we did a successful lookup and the
684 	 * incoming route had to be correct. prequeue might have dropped our dst.
685 	 */
686 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
687 		return;
688 
689 	/* Swap the send and the receive. */
690 	memset(&rep, 0, sizeof(rep));
691 	rep.th.dest   = th->source;
692 	rep.th.source = th->dest;
693 	rep.th.doff   = sizeof(struct tcphdr) / 4;
694 	rep.th.rst    = 1;
695 
696 	if (th->ack) {
697 		rep.th.seq = th->ack_seq;
698 	} else {
699 		rep.th.ack = 1;
700 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
701 				       skb->len - (th->doff << 2));
702 	}
703 
704 	memset(&arg, 0, sizeof(arg));
705 	arg.iov[0].iov_base = (unsigned char *)&rep;
706 	arg.iov[0].iov_len  = sizeof(rep.th);
707 
708 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
709 #ifdef CONFIG_TCP_MD5SIG
710 	rcu_read_lock();
711 	hash_location = tcp_parse_md5sig_option(th);
712 	if (sk && sk_fullsock(sk)) {
713 		const union tcp_md5_addr *addr;
714 		int l3index;
715 
716 		/* sdif set, means packet ingressed via a device
717 		 * in an L3 domain and inet_iif is set to it.
718 		 */
719 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
720 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
721 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
722 	} else if (hash_location) {
723 		const union tcp_md5_addr *addr;
724 		int sdif = tcp_v4_sdif(skb);
725 		int dif = inet_iif(skb);
726 		int l3index;
727 
728 		/*
729 		 * The active side is lost. Try to find the listening socket through
730 		 * the source port, and then find the md5 key through the listening
731 		 * socket. We do not lose security here:
732 		 * the incoming packet is checked with the md5 hash of the found key,
733 		 * and no RST is generated if the md5 hash doesn't match.
734 		 */
735 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
736 					     ip_hdr(skb)->saddr,
737 					     th->source, ip_hdr(skb)->daddr,
738 					     ntohs(th->source), dif, sdif);
739 		/* don't send an RST if we can't find the key */
740 		if (!sk1)
741 			goto out;
742 
743 		/* sdif set, means packet ingressed via a device
744 		 * in an L3 domain and dif is set to it.
745 		 */
746 		l3index = sdif ? dif : 0;
747 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
748 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
749 		if (!key)
750 			goto out;
751 
752 
753 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
754 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
755 			goto out;
756 
757 	}
758 
759 	if (key) {
760 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
761 				   (TCPOPT_NOP << 16) |
762 				   (TCPOPT_MD5SIG << 8) |
763 				   TCPOLEN_MD5SIG);
764 		/* Update length and the length the header thinks exists */
765 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
766 		rep.th.doff = arg.iov[0].iov_len / 4;
767 
768 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
769 				     key, ip_hdr(skb)->saddr,
770 				     ip_hdr(skb)->daddr, &rep.th);
771 	}
772 #endif
773 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
774 				      ip_hdr(skb)->saddr, /* XXX */
775 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
776 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
777 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
778 
779 	/* When the socket is gone, all binding information is lost.
780 	 * Routing might fail in this case. No choice here: if we choose to force
781 	 * the input interface, we will misroute in case of an asymmetric route.
782 	 */
783 	if (sk) {
784 		arg.bound_dev_if = sk->sk_bound_dev_if;
785 		if (sk_fullsock(sk))
786 			trace_tcp_send_reset(sk, skb);
787 	}
788 
789 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
790 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
791 
792 	arg.tos = ip_hdr(skb)->tos;
793 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
794 	local_bh_disable();
795 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
796 	if (sk) {
797 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
798 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
799 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
800 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
801 		transmit_time = tcp_transmit_time(sk);
802 	}
803 	ip_send_unicast_reply(ctl_sk,
804 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
805 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
806 			      &arg, arg.iov[0].iov_len,
807 			      transmit_time);
808 
809 	ctl_sk->sk_mark = 0;
810 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
811 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
812 	local_bh_enable();
813 
814 #ifdef CONFIG_TCP_MD5SIG
815 out:
816 	rcu_read_unlock();
817 #endif
818 }
819 
820 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
821    outside socket context, is certainly ugly. What can I do?
822  */
823 
824 static void tcp_v4_send_ack(const struct sock *sk,
825 			    struct sk_buff *skb, u32 seq, u32 ack,
826 			    u32 win, u32 tsval, u32 tsecr, int oif,
827 			    struct tcp_md5sig_key *key,
828 			    int reply_flags, u8 tos)
829 {
830 	const struct tcphdr *th = tcp_hdr(skb);
831 	struct {
832 		struct tcphdr th;
833 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
834 #ifdef CONFIG_TCP_MD5SIG
835 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
836 #endif
837 			];
838 	} rep;
839 	struct net *net = sock_net(sk);
840 	struct ip_reply_arg arg;
841 	struct sock *ctl_sk;
842 	u64 transmit_time;
843 
844 	memset(&rep.th, 0, sizeof(struct tcphdr));
845 	memset(&arg, 0, sizeof(arg));
846 
847 	arg.iov[0].iov_base = (unsigned char *)&rep;
848 	arg.iov[0].iov_len  = sizeof(rep.th);
849 	if (tsecr) {
850 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
851 				   (TCPOPT_TIMESTAMP << 8) |
852 				   TCPOLEN_TIMESTAMP);
853 		rep.opt[1] = htonl(tsval);
854 		rep.opt[2] = htonl(tsecr);
855 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
856 	}
857 
858 	/* Swap the send and the receive. */
859 	rep.th.dest    = th->source;
860 	rep.th.source  = th->dest;
861 	rep.th.doff    = arg.iov[0].iov_len / 4;
862 	rep.th.seq     = htonl(seq);
863 	rep.th.ack_seq = htonl(ack);
864 	rep.th.ack     = 1;
865 	rep.th.window  = htons(win);
866 
867 #ifdef CONFIG_TCP_MD5SIG
868 	if (key) {
869 		int offset = (tsecr) ? 3 : 0;
870 
871 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
872 					  (TCPOPT_NOP << 16) |
873 					  (TCPOPT_MD5SIG << 8) |
874 					  TCPOLEN_MD5SIG);
875 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
876 		rep.th.doff = arg.iov[0].iov_len/4;
877 
878 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
879 				    key, ip_hdr(skb)->saddr,
880 				    ip_hdr(skb)->daddr, &rep.th);
881 	}
882 #endif
883 	arg.flags = reply_flags;
884 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
885 				      ip_hdr(skb)->saddr, /* XXX */
886 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
887 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
888 	if (oif)
889 		arg.bound_dev_if = oif;
890 	arg.tos = tos;
891 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
892 	local_bh_disable();
893 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
894 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
895 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
896 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
897 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
898 	transmit_time = tcp_transmit_time(sk);
899 	ip_send_unicast_reply(ctl_sk,
900 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
901 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
902 			      &arg, arg.iov[0].iov_len,
903 			      transmit_time);
904 
905 	ctl_sk->sk_mark = 0;
906 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
907 	local_bh_enable();
908 }
909 
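/* Answer a segment received for a TIME-WAIT socket with an ACK built from
 * the state (next seq, window, timestamps, md5 key) kept in the tw bucket.
 */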
910 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
911 {
912 	struct inet_timewait_sock *tw = inet_twsk(sk);
913 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
914 
915 	tcp_v4_send_ack(sk, skb,
916 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
917 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
918 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
919 			tcptw->tw_ts_recent,
920 			tw->tw_bound_dev_if,
921 			tcp_twsk_md5_key(tcptw),
922 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
923 			tw->tw_tos
924 			);
925 
926 	inet_twsk_put(tw);
927 }
928 
929 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
930 				  struct request_sock *req)
931 {
932 	const union tcp_md5_addr *addr;
933 	int l3index;
934 
935 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
936 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
937 	 */
938 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
939 					     tcp_sk(sk)->snd_nxt;
940 
941 	/* RFC 7323 2.3
942 	 * The window field (SEG.WND) of every outgoing segment, with the
943 	 * exception of <SYN> segments, MUST be right-shifted by
944 	 * Rcv.Wind.Shift bits:
945 	 */
946 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
947 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
948 	tcp_v4_send_ack(sk, skb, seq,
949 			tcp_rsk(req)->rcv_nxt,
950 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
951 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
952 			req->ts_recent,
953 			0,
954 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
955 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
956 			ip_hdr(skb)->tos);
957 }
958 
959 /*
960  *	Send a SYN-ACK after having received a SYN.
961  *	This still operates on a request_sock only, not on a big
962  *	socket.
963  */
964 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
965 			      struct flowi *fl,
966 			      struct request_sock *req,
967 			      struct tcp_fastopen_cookie *foc,
968 			      enum tcp_synack_type synack_type,
969 			      struct sk_buff *syn_skb)
970 {
971 	const struct inet_request_sock *ireq = inet_rsk(req);
972 	struct flowi4 fl4;
973 	int err = -1;
974 	struct sk_buff *skb;
975 
976 	/* First, grab a route. */
977 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
978 		return -1;
979 
980 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
981 
982 	if (skb) {
983 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
984 
985 		rcu_read_lock();
986 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
987 					    ireq->ir_rmt_addr,
988 					    rcu_dereference(ireq->ireq_opt));
989 		rcu_read_unlock();
990 		err = net_xmit_eval(err);
991 	}
992 
993 	return err;
994 }
995 
996 /*
997  *	IPv4 request_sock destructor.
998  */
999 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1000 {
1001 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1002 }
1003 
1004 #ifdef CONFIG_TCP_MD5SIG
1005 /*
1006  * RFC2385 MD5 checksumming requires a mapping of
1007  * IP address->MD5 Key.
1008  * We need to maintain these in the sk structure.
1009  */
1010 
1011 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1012 EXPORT_SYMBOL(tcp_md5_needed);
1013 
1014 /* Find the Key structure for an address.  */
1015 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1016 					   const union tcp_md5_addr *addr,
1017 					   int family)
1018 {
1019 	const struct tcp_sock *tp = tcp_sk(sk);
1020 	struct tcp_md5sig_key *key;
1021 	const struct tcp_md5sig_info *md5sig;
1022 	__be32 mask;
1023 	struct tcp_md5sig_key *best_match = NULL;
1024 	bool match;
1025 
1026 	/* caller either holds rcu_read_lock() or socket lock */
1027 	md5sig = rcu_dereference_check(tp->md5sig_info,
1028 				       lockdep_sock_is_held(sk));
1029 	if (!md5sig)
1030 		return NULL;
1031 
1032 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1033 				 lockdep_sock_is_held(sk)) {
1034 		if (key->family != family)
1035 			continue;
1036 		if (key->l3index && key->l3index != l3index)
1037 			continue;
1038 		if (family == AF_INET) {
1039 			mask = inet_make_mask(key->prefixlen);
1040 			match = (key->addr.a4.s_addr & mask) ==
1041 				(addr->a4.s_addr & mask);
1042 #if IS_ENABLED(CONFIG_IPV6)
1043 		} else if (family == AF_INET6) {
1044 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1045 						  key->prefixlen);
1046 #endif
1047 		} else {
1048 			match = false;
1049 		}
1050 
1051 		if (match && (!best_match ||
1052 			      key->prefixlen > best_match->prefixlen))
1053 			best_match = key;
1054 	}
1055 	return best_match;
1056 }
1057 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1058 
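/* Exact-match lookup: unlike __tcp_md5_do_lookup(), the prefix length and
 * l3index must match exactly, so this is used when adding or deleting keys.
 */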
1059 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1060 						      const union tcp_md5_addr *addr,
1061 						      int family, u8 prefixlen,
1062 						      int l3index)
1063 {
1064 	const struct tcp_sock *tp = tcp_sk(sk);
1065 	struct tcp_md5sig_key *key;
1066 	unsigned int size = sizeof(struct in_addr);
1067 	const struct tcp_md5sig_info *md5sig;
1068 
1069 	/* caller either holds rcu_read_lock() or socket lock */
1070 	md5sig = rcu_dereference_check(tp->md5sig_info,
1071 				       lockdep_sock_is_held(sk));
1072 	if (!md5sig)
1073 		return NULL;
1074 #if IS_ENABLED(CONFIG_IPV6)
1075 	if (family == AF_INET6)
1076 		size = sizeof(struct in6_addr);
1077 #endif
1078 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1079 				 lockdep_sock_is_held(sk)) {
1080 		if (key->family != family)
1081 			continue;
1082 		if (key->l3index && key->l3index != l3index)
1083 			continue;
1084 		if (!memcmp(&key->addr, addr, size) &&
1085 		    key->prefixlen == prefixlen)
1086 			return key;
1087 	}
1088 	return NULL;
1089 }
1090 
1091 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1092 					 const struct sock *addr_sk)
1093 {
1094 	const union tcp_md5_addr *addr;
1095 	int l3index;
1096 
1097 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1098 						 addr_sk->sk_bound_dev_if);
1099 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1100 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1101 }
1102 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1103 
1104 /* This can be called on a newly created socket, from other files */
1105 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1106 		   int family, u8 prefixlen, int l3index,
1107 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1108 {
1109 	/* Add Key to the list */
1110 	struct tcp_md5sig_key *key;
1111 	struct tcp_sock *tp = tcp_sk(sk);
1112 	struct tcp_md5sig_info *md5sig;
1113 
1114 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1115 	if (key) {
1116 		/* Pre-existing entry - just update that one.
1117 		 * Note that the key might be used concurrently.
1118 		 * data_race() is telling kcsan that we do not care about
1119 		 * key mismatches, since changing the MD5 key on live flows
1120 		 * can lead to packet drops.
1121 		 */
1122 		data_race(memcpy(key->key, newkey, newkeylen));
1123 
1124 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1125 		 * Also note that a reader could catch the new key->keylen value
1126 		 * but the old key->key[]; this is the reason we use __GFP_ZERO
1127 		 * at sock_kmalloc() time below these lines.
1128 		 */
1129 		WRITE_ONCE(key->keylen, newkeylen);
1130 
1131 		return 0;
1132 	}
1133 
1134 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1135 					   lockdep_sock_is_held(sk));
1136 	if (!md5sig) {
1137 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1138 		if (!md5sig)
1139 			return -ENOMEM;
1140 
1141 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1142 		INIT_HLIST_HEAD(&md5sig->head);
1143 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1144 	}
1145 
1146 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1147 	if (!key)
1148 		return -ENOMEM;
1149 	if (!tcp_alloc_md5sig_pool()) {
1150 		sock_kfree_s(sk, key, sizeof(*key));
1151 		return -ENOMEM;
1152 	}
1153 
1154 	memcpy(key->key, newkey, newkeylen);
1155 	key->keylen = newkeylen;
1156 	key->family = family;
1157 	key->prefixlen = prefixlen;
1158 	key->l3index = l3index;
1159 	memcpy(&key->addr, addr,
1160 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1161 				      sizeof(struct in_addr));
1162 	hlist_add_head_rcu(&key->node, &md5sig->head);
1163 	return 0;
1164 }
1165 EXPORT_SYMBOL(tcp_md5_do_add);
1166 
1167 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1168 		   u8 prefixlen, int l3index)
1169 {
1170 	struct tcp_md5sig_key *key;
1171 
1172 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1173 	if (!key)
1174 		return -ENOENT;
1175 	hlist_del_rcu(&key->node);
1176 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1177 	kfree_rcu(key, rcu);
1178 	return 0;
1179 }
1180 EXPORT_SYMBOL(tcp_md5_do_del);
1181 
1182 static void tcp_clear_md5_list(struct sock *sk)
1183 {
1184 	struct tcp_sock *tp = tcp_sk(sk);
1185 	struct tcp_md5sig_key *key;
1186 	struct hlist_node *n;
1187 	struct tcp_md5sig_info *md5sig;
1188 
1189 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1190 
1191 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1192 		hlist_del_rcu(&key->node);
1193 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1194 		kfree_rcu(key, rcu);
1195 	}
1196 }
1197 
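/* setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT) handler: copy the key description
 * from the caller and add, update or (when tcpm_keylen is zero) delete the
 * matching key.
 */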
1198 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1199 				 sockptr_t optval, int optlen)
1200 {
1201 	struct tcp_md5sig cmd;
1202 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1203 	const union tcp_md5_addr *addr;
1204 	u8 prefixlen = 32;
1205 	int l3index = 0;
1206 
1207 	if (optlen < sizeof(cmd))
1208 		return -EINVAL;
1209 
1210 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1211 		return -EFAULT;
1212 
1213 	if (sin->sin_family != AF_INET)
1214 		return -EINVAL;
1215 
1216 	if (optname == TCP_MD5SIG_EXT &&
1217 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1218 		prefixlen = cmd.tcpm_prefixlen;
1219 		if (prefixlen > 32)
1220 			return -EINVAL;
1221 	}
1222 
1223 	if (optname == TCP_MD5SIG_EXT &&
1224 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1225 		struct net_device *dev;
1226 
1227 		rcu_read_lock();
1228 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1229 		if (dev && netif_is_l3_master(dev))
1230 			l3index = dev->ifindex;
1231 
1232 		rcu_read_unlock();
1233 
1234 		/* It is OK to check whether dev was set or not outside of RCU;
1235 		 * right now the device MUST be an L3 master.
1236 		 */
1237 		if (!dev || !l3index)
1238 			return -EINVAL;
1239 	}
1240 
1241 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1242 
1243 	if (!cmd.tcpm_keylen)
1244 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1245 
1246 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1247 		return -EINVAL;
1248 
1249 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1250 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1251 }
1252 
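/* Feed the IPv4 pseudo-header plus a copy of the TCP header (with its
 * checksum field zeroed) into the MD5 request, as required by RFC 2385.
 */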
1253 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1254 				   __be32 daddr, __be32 saddr,
1255 				   const struct tcphdr *th, int nbytes)
1256 {
1257 	struct tcp4_pseudohdr *bp;
1258 	struct scatterlist sg;
1259 	struct tcphdr *_th;
1260 
1261 	bp = hp->scratch;
1262 	bp->saddr = saddr;
1263 	bp->daddr = daddr;
1264 	bp->pad = 0;
1265 	bp->protocol = IPPROTO_TCP;
1266 	bp->len = cpu_to_be16(nbytes);
1267 
1268 	_th = (struct tcphdr *)(bp + 1);
1269 	memcpy(_th, th, sizeof(*th));
1270 	_th->check = 0;
1271 
1272 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1273 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1274 				sizeof(*bp) + sizeof(*th));
1275 	return crypto_ahash_update(hp->md5_req);
1276 }
1277 
1278 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1279 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1280 {
1281 	struct tcp_md5sig_pool *hp;
1282 	struct ahash_request *req;
1283 
1284 	hp = tcp_get_md5sig_pool();
1285 	if (!hp)
1286 		goto clear_hash_noput;
1287 	req = hp->md5_req;
1288 
1289 	if (crypto_ahash_init(req))
1290 		goto clear_hash;
1291 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1292 		goto clear_hash;
1293 	if (tcp_md5_hash_key(hp, key))
1294 		goto clear_hash;
1295 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1296 	if (crypto_ahash_final(req))
1297 		goto clear_hash;
1298 
1299 	tcp_put_md5sig_pool();
1300 	return 0;
1301 
1302 clear_hash:
1303 	tcp_put_md5sig_pool();
1304 clear_hash_noput:
1305 	memset(md5_hash, 0, 16);
1306 	return 1;
1307 }
1308 
1309 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1310 			const struct sock *sk,
1311 			const struct sk_buff *skb)
1312 {
1313 	struct tcp_md5sig_pool *hp;
1314 	struct ahash_request *req;
1315 	const struct tcphdr *th = tcp_hdr(skb);
1316 	__be32 saddr, daddr;
1317 
1318 	if (sk) { /* valid for establish/request sockets */
1319 		saddr = sk->sk_rcv_saddr;
1320 		daddr = sk->sk_daddr;
1321 	} else {
1322 		const struct iphdr *iph = ip_hdr(skb);
1323 		saddr = iph->saddr;
1324 		daddr = iph->daddr;
1325 	}
1326 
1327 	hp = tcp_get_md5sig_pool();
1328 	if (!hp)
1329 		goto clear_hash_noput;
1330 	req = hp->md5_req;
1331 
1332 	if (crypto_ahash_init(req))
1333 		goto clear_hash;
1334 
1335 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1336 		goto clear_hash;
1337 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1338 		goto clear_hash;
1339 	if (tcp_md5_hash_key(hp, key))
1340 		goto clear_hash;
1341 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1342 	if (crypto_ahash_final(req))
1343 		goto clear_hash;
1344 
1345 	tcp_put_md5sig_pool();
1346 	return 0;
1347 
1348 clear_hash:
1349 	tcp_put_md5sig_pool();
1350 clear_hash_noput:
1351 	memset(md5_hash, 0, 16);
1352 	return 1;
1353 }
1354 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1355 
1356 #endif
1357 
1358 /* Called with rcu_read_lock() */
1359 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1360 				    const struct sk_buff *skb,
1361 				    int dif, int sdif)
1362 {
1363 #ifdef CONFIG_TCP_MD5SIG
1364 	/*
1365 	 * This gets called for each TCP segment that arrives
1366 	 * so we want to be efficient.
1367 	 * We have 3 drop cases:
1368 	 * o No MD5 hash and one expected.
1369 	 * o MD5 hash and we're not expecting one.
1370 	 * o MD5 hash and it's wrong.
1371 	 */
1372 	const __u8 *hash_location = NULL;
1373 	struct tcp_md5sig_key *hash_expected;
1374 	const struct iphdr *iph = ip_hdr(skb);
1375 	const struct tcphdr *th = tcp_hdr(skb);
1376 	const union tcp_md5_addr *addr;
1377 	unsigned char newhash[16];
1378 	int genhash, l3index;
1379 
1380 	/* sdif set, means packet ingressed via a device
1381 	 * in an L3 domain and dif is set to the l3mdev
1382 	 */
1383 	l3index = sdif ? dif : 0;
1384 
1385 	addr = (union tcp_md5_addr *)&iph->saddr;
1386 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1387 	hash_location = tcp_parse_md5sig_option(th);
1388 
1389 	/* We've parsed the options - do we have a hash? */
1390 	if (!hash_expected && !hash_location)
1391 		return false;
1392 
1393 	if (hash_expected && !hash_location) {
1394 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1395 		return true;
1396 	}
1397 
1398 	if (!hash_expected && hash_location) {
1399 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1400 		return true;
1401 	}
1402 
1403 	/* Okay, so this is hash_expected and hash_location -
1404 	 * so we need to calculate the checksum.
1405 	 */
1406 	genhash = tcp_v4_md5_hash_skb(newhash,
1407 				      hash_expected,
1408 				      NULL, skb);
1409 
1410 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1411 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1412 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1413 				     &iph->saddr, ntohs(th->source),
1414 				     &iph->daddr, ntohs(th->dest),
1415 				     genhash ? " tcp_v4_calc_md5_hash failed"
1416 				     : "", l3index);
1417 		return true;
1418 	}
1419 	return false;
1420 #endif
1421 	return false;
1422 }
1423 
1424 static void tcp_v4_init_req(struct request_sock *req,
1425 			    const struct sock *sk_listener,
1426 			    struct sk_buff *skb)
1427 {
1428 	struct inet_request_sock *ireq = inet_rsk(req);
1429 	struct net *net = sock_net(sk_listener);
1430 
1431 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1432 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1433 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1434 }
1435 
1436 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1437 					  struct flowi *fl,
1438 					  const struct request_sock *req)
1439 {
1440 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1441 }
1442 
1443 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1444 	.family		=	PF_INET,
1445 	.obj_size	=	sizeof(struct tcp_request_sock),
1446 	.rtx_syn_ack	=	tcp_rtx_synack,
1447 	.send_ack	=	tcp_v4_reqsk_send_ack,
1448 	.destructor	=	tcp_v4_reqsk_destructor,
1449 	.send_reset	=	tcp_v4_send_reset,
1450 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1451 };
1452 
1453 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1454 	.mss_clamp	=	TCP_MSS_DEFAULT,
1455 #ifdef CONFIG_TCP_MD5SIG
1456 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1457 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1458 #endif
1459 	.init_req	=	tcp_v4_init_req,
1460 #ifdef CONFIG_SYN_COOKIES
1461 	.cookie_init_seq =	cookie_v4_init_sequence,
1462 #endif
1463 	.route_req	=	tcp_v4_route_req,
1464 	.init_seq	=	tcp_v4_init_seq,
1465 	.init_ts_off	=	tcp_v4_init_ts_off,
1466 	.send_synack	=	tcp_v4_send_synack,
1467 };
1468 
1469 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1470 {
1471 	/* Never answer SYNs sent to broadcast or multicast */
1472 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1473 		goto drop;
1474 
1475 	return tcp_conn_request(&tcp_request_sock_ops,
1476 				&tcp_request_sock_ipv4_ops, sk, skb);
1477 
1478 drop:
1479 	tcp_listendrop(sk);
1480 	return 0;
1481 }
1482 EXPORT_SYMBOL(tcp_v4_conn_request);
1483 
1484 
1485 /*
1486  * The three way handshake has completed - we got a valid synack -
1487  * now create the new socket.
1488  */
1489 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1490 				  struct request_sock *req,
1491 				  struct dst_entry *dst,
1492 				  struct request_sock *req_unhash,
1493 				  bool *own_req)
1494 {
1495 	struct inet_request_sock *ireq;
1496 	struct inet_sock *newinet;
1497 	struct tcp_sock *newtp;
1498 	struct sock *newsk;
1499 #ifdef CONFIG_TCP_MD5SIG
1500 	const union tcp_md5_addr *addr;
1501 	struct tcp_md5sig_key *key;
1502 	int l3index;
1503 #endif
1504 	struct ip_options_rcu *inet_opt;
1505 
1506 	if (sk_acceptq_is_full(sk))
1507 		goto exit_overflow;
1508 
1509 	newsk = tcp_create_openreq_child(sk, req, skb);
1510 	if (!newsk)
1511 		goto exit_nonewsk;
1512 
1513 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1514 	inet_sk_rx_dst_set(newsk, skb);
1515 
1516 	newtp		      = tcp_sk(newsk);
1517 	newinet		      = inet_sk(newsk);
1518 	ireq		      = inet_rsk(req);
1519 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1520 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1521 	newsk->sk_bound_dev_if = ireq->ir_iif;
1522 	newinet->inet_saddr   = ireq->ir_loc_addr;
1523 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1524 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1525 	newinet->mc_index     = inet_iif(skb);
1526 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1527 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1528 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1529 	if (inet_opt)
1530 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1531 	newinet->inet_id = prandom_u32();
1532 
1533 	if (!dst) {
1534 		dst = inet_csk_route_child_sock(sk, newsk, req);
1535 		if (!dst)
1536 			goto put_and_exit;
1537 	} else {
1538 		/* syncookie case: see end of cookie_v4_check() */
1539 	}
1540 	sk_setup_caps(newsk, dst);
1541 
1542 	tcp_ca_openreq_child(newsk, dst);
1543 
1544 	tcp_sync_mss(newsk, dst_mtu(dst));
1545 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1546 
1547 	tcp_initialize_rcv_mss(newsk);
1548 
1549 #ifdef CONFIG_TCP_MD5SIG
1550 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1551 	/* Copy over the MD5 key from the original socket */
1552 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1553 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1554 	if (key) {
1555 		/*
1556 		 * We're using one, so create a matching key
1557 		 * on the newsk structure. If we fail to get
1558 		 * memory, then we end up not copying the key
1559 		 * across. Shucks.
1560 		 */
1561 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1562 			       key->key, key->keylen, GFP_ATOMIC);
1563 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1564 	}
1565 #endif
1566 
1567 	if (__inet_inherit_port(sk, newsk) < 0)
1568 		goto put_and_exit;
1569 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1570 	if (likely(*own_req)) {
1571 		tcp_move_syn(newtp, req);
1572 		ireq->ireq_opt = NULL;
1573 	} else {
1574 		newinet->inet_opt = NULL;
1575 	}
1576 	return newsk;
1577 
1578 exit_overflow:
1579 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1580 exit_nonewsk:
1581 	dst_release(dst);
1582 exit:
1583 	tcp_listendrop(sk);
1584 	return NULL;
1585 put_and_exit:
1586 	newinet->inet_opt = NULL;
1587 	inet_csk_prepare_forced_close(newsk);
1588 	tcp_done(newsk);
1589 	goto exit;
1590 }
1591 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1592 
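/* On a listener, a non-SYN segment may be the ACK completing a syncookie
 * handshake; cookie_v4_check() validates it and, if valid, creates the
 * child socket.
 */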
1593 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1594 {
1595 #ifdef CONFIG_SYN_COOKIES
1596 	const struct tcphdr *th = tcp_hdr(skb);
1597 
1598 	if (!th->syn)
1599 		sk = cookie_v4_check(sk, skb);
1600 #endif
1601 	return sk;
1602 }
1603 
1604 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1605 			 struct tcphdr *th, u32 *cookie)
1606 {
1607 	u16 mss = 0;
1608 #ifdef CONFIG_SYN_COOKIES
1609 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1610 				    &tcp_request_sock_ipv4_ops, sk, th);
1611 	if (mss) {
1612 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1613 		tcp_synq_overflow(sk);
1614 	}
1615 #endif
1616 	return mss;
1617 }
1618 
1619 /* The socket must have its spinlock held when we get
1620  * here, unless it is a TCP_LISTEN socket.
1621  *
1622  * We have a potential double-lock case here, so even when
1623  * doing backlog processing we use the BH locking scheme.
1624  * This is because we cannot sleep with the original spinlock
1625  * held.
1626  */
1627 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1628 {
1629 	struct sock *rsk;
1630 
1631 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1632 		struct dst_entry *dst = sk->sk_rx_dst;
1633 
1634 		sock_rps_save_rxhash(sk, skb);
1635 		sk_mark_napi_id(sk, skb);
1636 		if (dst) {
1637 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1638 			    !dst->ops->check(dst, 0)) {
1639 				dst_release(dst);
1640 				sk->sk_rx_dst = NULL;
1641 			}
1642 		}
1643 		tcp_rcv_established(sk, skb);
1644 		return 0;
1645 	}
1646 
1647 	if (tcp_checksum_complete(skb))
1648 		goto csum_err;
1649 
1650 	if (sk->sk_state == TCP_LISTEN) {
1651 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1652 
1653 		if (!nsk)
1654 			goto discard;
1655 		if (nsk != sk) {
1656 			if (tcp_child_process(sk, nsk, skb)) {
1657 				rsk = nsk;
1658 				goto reset;
1659 			}
1660 			return 0;
1661 		}
1662 	} else
1663 		sock_rps_save_rxhash(sk, skb);
1664 
1665 	if (tcp_rcv_state_process(sk, skb)) {
1666 		rsk = sk;
1667 		goto reset;
1668 	}
1669 	return 0;
1670 
1671 reset:
1672 	tcp_v4_send_reset(rsk, skb);
1673 discard:
1674 	kfree_skb(skb);
1675 	/* Be careful here. If this function gets more complicated and
1676 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1677 	 * might be destroyed here. This current version compiles correctly,
1678 	 * but you have been warned.
1679 	 */
1680 	return 0;
1681 
1682 csum_err:
1683 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1684 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1685 	goto discard;
1686 }
1687 EXPORT_SYMBOL(tcp_v4_do_rcv);
1688 
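/* Early demux, run from the IP receive path: if an established socket
 * matches the segment's 4-tuple, attach it (and its cached rx dst when
 * still valid) to the skb so the regular receive path can skip the lookup.
 */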
1689 int tcp_v4_early_demux(struct sk_buff *skb)
1690 {
1691 	const struct iphdr *iph;
1692 	const struct tcphdr *th;
1693 	struct sock *sk;
1694 
1695 	if (skb->pkt_type != PACKET_HOST)
1696 		return 0;
1697 
1698 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1699 		return 0;
1700 
1701 	iph = ip_hdr(skb);
1702 	th = tcp_hdr(skb);
1703 
1704 	if (th->doff < sizeof(struct tcphdr) / 4)
1705 		return 0;
1706 
1707 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1708 				       iph->saddr, th->source,
1709 				       iph->daddr, ntohs(th->dest),
1710 				       skb->skb_iif, inet_sdif(skb));
1711 	if (sk) {
1712 		skb->sk = sk;
1713 		skb->destructor = sock_edemux;
1714 		if (sk_fullsock(sk)) {
1715 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1716 
1717 			if (dst)
1718 				dst = dst_check(dst, 0);
1719 			if (dst &&
1720 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1721 				skb_dst_set_noref(skb, dst);
1722 		}
1723 	}
1724 	return 0;
1725 }
1726 
1727 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1728 {
1729 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1730 	struct skb_shared_info *shinfo;
1731 	const struct tcphdr *th;
1732 	struct tcphdr *thtail;
1733 	struct sk_buff *tail;
1734 	unsigned int hdrlen;
1735 	bool fragstolen;
1736 	u32 gso_segs;
1737 	int delta;
1738 
1739 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1740 	 * we can fix skb->truesize to its real value to avoid future drops.
1741 	 * This is valid because the skb is not yet charged to the socket.
1742 	 * It has been noticed that pure SACK packets were sometimes dropped
1743 	 * (if cooked by drivers without the copybreak feature).
1744 	 */
1745 	skb_condense(skb);
1746 
1747 	skb_dst_drop(skb);
1748 
1749 	if (unlikely(tcp_checksum_complete(skb))) {
1750 		bh_unlock_sock(sk);
1751 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1752 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1753 		return true;
1754 	}
1755 
1756 	/* Attempt coalescing to last skb in backlog, even if we are
1757 	 * above the limits.
1758 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1759 	 */
1760 	th = (const struct tcphdr *)skb->data;
1761 	hdrlen = th->doff * 4;
1762 	shinfo = skb_shinfo(skb);
1763 
1764 	if (!shinfo->gso_size)
1765 		shinfo->gso_size = skb->len - hdrlen;
1766 
1767 	if (!shinfo->gso_segs)
1768 		shinfo->gso_segs = 1;
1769 
1770 	tail = sk->sk_backlog.tail;
1771 	if (!tail)
1772 		goto no_coalesce;
1773 	thtail = (struct tcphdr *)tail->data;
1774 
1775 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1776 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1777 	    ((TCP_SKB_CB(tail)->tcp_flags |
1778 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1779 	    !((TCP_SKB_CB(tail)->tcp_flags &
1780 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1781 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1782 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1783 #ifdef CONFIG_TLS_DEVICE
1784 	    tail->decrypted != skb->decrypted ||
1785 #endif
1786 	    thtail->doff != th->doff ||
1787 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1788 		goto no_coalesce;
1789 
1790 	__skb_pull(skb, hdrlen);
1791 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1792 		thtail->window = th->window;
1793 
1794 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1795 
1796 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1797 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1798 
1799 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1800 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1801 		 * is not entered if we append a packet with a FIN.
1802 		 * SYN, RST, URG are not present.
1803 		 * ACK is set on both packets.
1804 		 * PSH : the TCP stack does not really care,
1805 		 *       at least for 'GRO' packets.
1806 		 */
1807 		thtail->fin |= th->fin;
1808 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1809 
1810 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1811 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1812 			tail->tstamp = skb->tstamp;
1813 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1814 		}
1815 
1816 		/* Not as strict as GRO. We only need to carry the max mss value */
1817 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1818 						 skb_shinfo(tail)->gso_size);
1819 
1820 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1821 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1822 
1823 		sk->sk_backlog.len += delta;
1824 		__NET_INC_STATS(sock_net(sk),
1825 				LINUX_MIB_TCPBACKLOGCOALESCE);
1826 		kfree_skb_partial(skb, fragstolen);
1827 		return false;
1828 	}
1829 	__skb_push(skb, hdrlen);
1830 
1831 no_coalesce:
1832 	/* Only the socket owner can try to collapse/prune rx queues
1833 	 * to reduce memory overhead, so add a little headroom here.
1834 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1835 	 */
1836 	limit += 64*1024;
1837 
1838 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1839 		bh_unlock_sock(sk);
1840 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1841 		return true;
1842 	}
1843 	return false;
1844 }
1845 EXPORT_SYMBOL(tcp_add_backlog);
1846 
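/* Run the socket filter (classic or eBPF, e.g. attached with
 * SO_ATTACH_FILTER / SO_ATTACH_BPF) on the skb, never trimming it below
 * the TCP header length.
 */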
1847 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1848 {
1849 	struct tcphdr *th = (struct tcphdr *)skb->data;
1850 
1851 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1852 }
1853 EXPORT_SYMBOL(tcp_filter);
1854 
1855 static void tcp_v4_restore_cb(struct sk_buff *skb)
1856 {
1857 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1858 		sizeof(struct inet_skb_parm));
1859 }
1860 
1861 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1862 			   const struct tcphdr *th)
1863 {
1864 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1865 	 * barrier() makes sure the compiler won't play aliasing games.
1866 	 */
1867 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1868 		sizeof(struct inet_skb_parm));
1869 	barrier();
1870 
1871 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1872 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1873 				    skb->len - th->doff * 4);
1874 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1875 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1876 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1877 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1878 	TCP_SKB_CB(skb)->sacked	 = 0;
1879 	TCP_SKB_CB(skb)->has_rxtstamp =
1880 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1881 }
1882 
1883 /*
1884  *	From tcp_input.c
1885  */
1886 
1887 int tcp_v4_rcv(struct sk_buff *skb)
1888 {
1889 	struct net *net = dev_net(skb->dev);
1890 	struct sk_buff *skb_to_free;
1891 	int sdif = inet_sdif(skb);
1892 	int dif = inet_iif(skb);
1893 	const struct iphdr *iph;
1894 	const struct tcphdr *th;
1895 	bool refcounted;
1896 	struct sock *sk;
1897 	int ret;
1898 
1899 	if (skb->pkt_type != PACKET_HOST)
1900 		goto discard_it;
1901 
1902 	/* Count it even if it's bad */
1903 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1904 
1905 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1906 		goto discard_it;
1907 
1908 	th = (const struct tcphdr *)skb->data;
1909 
1910 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1911 		goto bad_packet;
1912 	if (!pskb_may_pull(skb, th->doff * 4))
1913 		goto discard_it;
1914 
1915 	/* An explanation is required here, I think.
1916 	 * Packet length and doff are validated later by header prediction,
1917 	 * provided the th->doff == 0 case has been eliminated above.
1918 	 * So, we defer those checks. */
1919 
1920 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1921 		goto csum_error;
1922 
1923 	th = (const struct tcphdr *)skb->data;
1924 	iph = ip_hdr(skb);
1925 lookup:
1926 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1927 			       th->dest, sdif, &refcounted);
1928 	if (!sk)
1929 		goto no_tcp_socket;
1930 
1931 process:
1932 	if (sk->sk_state == TCP_TIME_WAIT)
1933 		goto do_time_wait;
1934 
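	/* A TCP_NEW_SYN_RECV pseudo-socket is a request_sock found in the
	 * established hash: the final ACK of the 3WHS (or a retransmit) is
	 * arriving.  Validate MD5 and the checksum against the listener,
	 * then let tcp_check_req() either create the child socket or tell
	 * us the request was stolen by another CPU.
	 */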
1935 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1936 		struct request_sock *req = inet_reqsk(sk);
1937 		bool req_stolen = false;
1938 		struct sock *nsk;
1939 
1940 		sk = req->rsk_listener;
1941 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1942 			sk_drops_add(sk, skb);
1943 			reqsk_put(req);
1944 			goto discard_it;
1945 		}
1946 		if (tcp_checksum_complete(skb)) {
1947 			reqsk_put(req);
1948 			goto csum_error;
1949 		}
1950 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1951 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1952 			goto lookup;
1953 		}
1954 		/* We own a reference on the listener, increase it again
1955 		 * as we might lose it too soon.
1956 		 */
1957 		sock_hold(sk);
1958 		refcounted = true;
1959 		nsk = NULL;
1960 		if (!tcp_filter(sk, skb)) {
1961 			th = (const struct tcphdr *)skb->data;
1962 			iph = ip_hdr(skb);
1963 			tcp_v4_fill_cb(skb, iph, th);
1964 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1965 		}
1966 		if (!nsk) {
1967 			reqsk_put(req);
1968 			if (req_stolen) {
1969 				/* Another cpu got exclusive access to req
1970 				 * and created a full-blown socket.
1971 				 * Try to feed this packet to this socket
1972 				 * instead of discarding it.
1973 				 */
1974 				tcp_v4_restore_cb(skb);
1975 				sock_put(sk);
1976 				goto lookup;
1977 			}
1978 			goto discard_and_relse;
1979 		}
1980 		if (nsk == sk) {
1981 			reqsk_put(req);
1982 			tcp_v4_restore_cb(skb);
1983 		} else if (tcp_child_process(sk, nsk, skb)) {
1984 			tcp_v4_send_reset(nsk, skb);
1985 			goto discard_and_relse;
1986 		} else {
1987 			sock_put(sk);
1988 			return 0;
1989 		}
1990 	}
1991 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1992 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1993 		goto discard_and_relse;
1994 	}
1995 
1996 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1997 		goto discard_and_relse;
1998 
1999 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2000 		goto discard_and_relse;
2001 
2002 	nf_reset_ct(skb);
2003 
2004 	if (tcp_filter(sk, skb))
2005 		goto discard_and_relse;
2006 	th = (const struct tcphdr *)skb->data;
2007 	iph = ip_hdr(skb);
2008 	tcp_v4_fill_cb(skb, iph, th);
2009 
2010 	skb->dev = NULL;
2011 
2012 	if (sk->sk_state == TCP_LISTEN) {
2013 		ret = tcp_v4_do_rcv(sk, skb);
2014 		goto put_and_return;
2015 	}
2016 
2017 	sk_incoming_cpu_update(sk);
2018 
2019 	bh_lock_sock_nested(sk);
2020 	tcp_segs_in(tcp_sk(sk), skb);
2021 	ret = 0;
2022 	if (!sock_owned_by_user(sk)) {
2023 		skb_to_free = sk->sk_rx_skb_cache;
2024 		sk->sk_rx_skb_cache = NULL;
2025 		ret = tcp_v4_do_rcv(sk, skb);
2026 	} else {
2027 		if (tcp_add_backlog(sk, skb))
2028 			goto discard_and_relse;
2029 		skb_to_free = NULL;
2030 	}
2031 	bh_unlock_sock(sk);
2032 	if (skb_to_free)
2033 		__kfree_skb(skb_to_free);
2034 
2035 put_and_return:
2036 	if (refcounted)
2037 		sock_put(sk);
2038 
2039 	return ret;
2040 
2041 no_tcp_socket:
2042 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2043 		goto discard_it;
2044 
2045 	tcp_v4_fill_cb(skb, iph, th);
2046 
2047 	if (tcp_checksum_complete(skb)) {
2048 csum_error:
2049 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2050 bad_packet:
2051 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2052 	} else {
2053 		tcp_v4_send_reset(NULL, skb);
2054 	}
2055 
2056 discard_it:
2057 	/* Discard frame. */
2058 	kfree_skb(skb);
2059 	return 0;
2060 
2061 discard_and_relse:
2062 	sk_drops_add(sk, skb);
2063 	if (refcounted)
2064 		sock_put(sk);
2065 	goto discard_it;
2066 
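/* Packets matching a TIME-WAIT socket: a valid new SYN re-targets the packet
 * to a current listener (TCP_TW_SYN); other segments are either answered
 * with an ACK (TCP_TW_ACK), trigger a reset (TCP_TW_RST), or are silently
 * dropped (TCP_TW_SUCCESS).
 */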
2067 do_time_wait:
2068 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2069 		inet_twsk_put(inet_twsk(sk));
2070 		goto discard_it;
2071 	}
2072 
2073 	tcp_v4_fill_cb(skb, iph, th);
2074 
2075 	if (tcp_checksum_complete(skb)) {
2076 		inet_twsk_put(inet_twsk(sk));
2077 		goto csum_error;
2078 	}
2079 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2080 	case TCP_TW_SYN: {
2081 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2082 							&tcp_hashinfo, skb,
2083 							__tcp_hdrlen(th),
2084 							iph->saddr, th->source,
2085 							iph->daddr, th->dest,
2086 							inet_iif(skb),
2087 							sdif);
2088 		if (sk2) {
2089 			inet_twsk_deschedule_put(inet_twsk(sk));
2090 			sk = sk2;
2091 			tcp_v4_restore_cb(skb);
2092 			refcounted = false;
2093 			goto process;
2094 		}
2095 	}
2096 		/* to ACK */
2097 		fallthrough;
2098 	case TCP_TW_ACK:
2099 		tcp_v4_timewait_ack(sk, skb);
2100 		break;
2101 	case TCP_TW_RST:
2102 		tcp_v4_send_reset(sk, skb);
2103 		inet_twsk_deschedule_put(inet_twsk(sk));
2104 		goto discard_it;
2105 	case TCP_TW_SUCCESS:;
2106 	}
2107 	goto discard_it;
2108 }
2109 
2110 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2111 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2112 	.twsk_unique	= tcp_twsk_unique,
2113 	.twsk_destructor= tcp_twsk_destructor,
2114 };
2115 
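/* Cache the input route (and the incoming ifindex) on the socket so that
 * tcp_v4_early_demux() can attach it to future packets of this flow.
 */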
2116 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2117 {
2118 	struct dst_entry *dst = skb_dst(skb);
2119 
2120 	if (dst && dst_hold_safe(dst)) {
2121 		sk->sk_rx_dst = dst;
2122 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2123 	}
2124 }
2125 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2126 
2127 const struct inet_connection_sock_af_ops ipv4_specific = {
2128 	.queue_xmit	   = ip_queue_xmit,
2129 	.send_check	   = tcp_v4_send_check,
2130 	.rebuild_header	   = inet_sk_rebuild_header,
2131 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2132 	.conn_request	   = tcp_v4_conn_request,
2133 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2134 	.net_header_len	   = sizeof(struct iphdr),
2135 	.setsockopt	   = ip_setsockopt,
2136 	.getsockopt	   = ip_getsockopt,
2137 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2138 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2139 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2140 };
2141 EXPORT_SYMBOL(ipv4_specific);
2142 
2143 #ifdef CONFIG_TCP_MD5SIG
2144 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2145 	.md5_lookup		= tcp_v4_md5_lookup,
2146 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2147 	.md5_parse		= tcp_v4_parse_md5_keys,
2148 };
2149 #endif
2150 
2151 /* NOTE: A lot of fields are set to zero explicitly by the call to
2152  *       sk_alloc(), so they need not be initialized here.
2153  */
2154 static int tcp_v4_init_sock(struct sock *sk)
2155 {
2156 	struct inet_connection_sock *icsk = inet_csk(sk);
2157 
2158 	tcp_init_sock(sk);
2159 
2160 	icsk->icsk_af_ops = &ipv4_specific;
2161 
2162 #ifdef CONFIG_TCP_MD5SIG
2163 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2164 #endif
2165 
2166 	return 0;
2167 }
2168 
2169 void tcp_v4_destroy_sock(struct sock *sk)
2170 {
2171 	struct tcp_sock *tp = tcp_sk(sk);
2172 
2173 	trace_tcp_destroy_sock(sk);
2174 
2175 	tcp_clear_xmit_timers(sk);
2176 
2177 	tcp_cleanup_congestion_control(sk);
2178 
2179 	tcp_cleanup_ulp(sk);
2180 
2181 	/* Clean up the write buffer. */
2182 	tcp_write_queue_purge(sk);
2183 
2184 	/* Check if we want to disable active TFO */
2185 	tcp_fastopen_active_disable_ofo_check(sk);
2186 
2187 	/* Clean up our, hopefully empty, out_of_order_queue. */
2188 	skb_rbtree_purge(&tp->out_of_order_queue);
2189 
2190 #ifdef CONFIG_TCP_MD5SIG
2191 	/* Clean up the MD5 key list, if any */
2192 	if (tp->md5sig_info) {
2193 		tcp_clear_md5_list(sk);
2194 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2195 		tp->md5sig_info = NULL;
2196 	}
2197 #endif
2198 
2199 	/* Clean up a referenced TCP bind bucket. */
2200 	if (inet_csk(sk)->icsk_bind_hash)
2201 		inet_put_port(sk);
2202 
2203 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2204 
2205 	/* If the socket was aborted during the connect operation */
2206 	tcp_free_fastopen_req(tp);
2207 	tcp_fastopen_destroy_cipher(sk);
2208 	tcp_saved_syn_free(tp);
2209 
2210 	sk_sockets_allocated_dec(sk);
2211 }
2212 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2213 
2214 #ifdef CONFIG_PROC_FS
2215 /* Proc filesystem TCP sock list dumping. */
2216 
2217 /*
2218  * Get the next listener socket following cur.  If cur is NULL, get the first socket
2219  * starting from bucket given in st->bucket; when st->bucket is zero the
2220  * very first socket in the hash table is returned.
2221  */
2222 static void *listening_get_next(struct seq_file *seq, void *cur)
2223 {
2224 	struct tcp_seq_afinfo *afinfo;
2225 	struct tcp_iter_state *st = seq->private;
2226 	struct net *net = seq_file_net(seq);
2227 	struct inet_listen_hashbucket *ilb;
2228 	struct hlist_nulls_node *node;
2229 	struct sock *sk = cur;
2230 
2231 	if (st->bpf_seq_afinfo)
2232 		afinfo = st->bpf_seq_afinfo;
2233 	else
2234 		afinfo = PDE_DATA(file_inode(seq->file));
2235 
2236 	if (!sk) {
2237 get_head:
2238 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2239 		spin_lock(&ilb->lock);
2240 		sk = sk_nulls_head(&ilb->nulls_head);
2241 		st->offset = 0;
2242 		goto get_sk;
2243 	}
2244 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2245 	++st->num;
2246 	++st->offset;
2247 
2248 	sk = sk_nulls_next(sk);
2249 get_sk:
2250 	sk_nulls_for_each_from(sk, node) {
2251 		if (!net_eq(sock_net(sk), net))
2252 			continue;
2253 		if (afinfo->family == AF_UNSPEC ||
2254 		    sk->sk_family == afinfo->family)
2255 			return sk;
2256 	}
2257 	spin_unlock(&ilb->lock);
2258 	st->offset = 0;
2259 	if (++st->bucket < INET_LHTABLE_SIZE)
2260 		goto get_head;
2261 	return NULL;
2262 }
2263 
2264 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2265 {
2266 	struct tcp_iter_state *st = seq->private;
2267 	void *rc;
2268 
2269 	st->bucket = 0;
2270 	st->offset = 0;
2271 	rc = listening_get_next(seq, NULL);
2272 
2273 	while (rc && *pos) {
2274 		rc = listening_get_next(seq, rc);
2275 		--*pos;
2276 	}
2277 	return rc;
2278 }
2279 
2280 static inline bool empty_bucket(const struct tcp_iter_state *st)
2281 {
2282 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2283 }
2284 
2285 /*
2286  * Get first established socket starting from bucket given in st->bucket.
2287  * If st->bucket is zero, the very first socket in the hash is returned.
2288  */
2289 static void *established_get_first(struct seq_file *seq)
2290 {
2291 	struct tcp_seq_afinfo *afinfo;
2292 	struct tcp_iter_state *st = seq->private;
2293 	struct net *net = seq_file_net(seq);
2294 	void *rc = NULL;
2295 
2296 	if (st->bpf_seq_afinfo)
2297 		afinfo = st->bpf_seq_afinfo;
2298 	else
2299 		afinfo = PDE_DATA(file_inode(seq->file));
2300 
2301 	st->offset = 0;
2302 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2303 		struct sock *sk;
2304 		struct hlist_nulls_node *node;
2305 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2306 
2307 		/* Lockless fast path for the common case of empty buckets */
2308 		if (empty_bucket(st))
2309 			continue;
2310 
2311 		spin_lock_bh(lock);
2312 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2313 			if ((afinfo->family != AF_UNSPEC &&
2314 			     sk->sk_family != afinfo->family) ||
2315 			    !net_eq(sock_net(sk), net)) {
2316 				continue;
2317 			}
2318 			rc = sk;
2319 			goto out;
2320 		}
2321 		spin_unlock_bh(lock);
2322 	}
2323 out:
2324 	return rc;
2325 }
2326 
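/*
 * Get the next established socket following cur, moving on to the next
 * non-empty ehash bucket (via established_get_first()) once the current
 * chain is exhausted.
 */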
2327 static void *established_get_next(struct seq_file *seq, void *cur)
2328 {
2329 	struct tcp_seq_afinfo *afinfo;
2330 	struct sock *sk = cur;
2331 	struct hlist_nulls_node *node;
2332 	struct tcp_iter_state *st = seq->private;
2333 	struct net *net = seq_file_net(seq);
2334 
2335 	if (st->bpf_seq_afinfo)
2336 		afinfo = st->bpf_seq_afinfo;
2337 	else
2338 		afinfo = PDE_DATA(file_inode(seq->file));
2339 
2340 	++st->num;
2341 	++st->offset;
2342 
2343 	sk = sk_nulls_next(sk);
2344 
2345 	sk_nulls_for_each_from(sk, node) {
2346 		if ((afinfo->family == AF_UNSPEC ||
2347 		     sk->sk_family == afinfo->family) &&
2348 		    net_eq(sock_net(sk), net))
2349 			return sk;
2350 	}
2351 
2352 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2353 	++st->bucket;
2354 	return established_get_first(seq);
2355 }
2356 
2357 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2358 {
2359 	struct tcp_iter_state *st = seq->private;
2360 	void *rc;
2361 
2362 	st->bucket = 0;
2363 	rc = established_get_first(seq);
2364 
2365 	while (rc && pos) {
2366 		rc = established_get_next(seq, rc);
2367 		--pos;
2368 	}
2369 	return rc;
2370 }
2371 
2372 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2373 {
2374 	void *rc;
2375 	struct tcp_iter_state *st = seq->private;
2376 
2377 	st->state = TCP_SEQ_STATE_LISTENING;
2378 	rc	  = listening_get_idx(seq, &pos);
2379 
2380 	if (!rc) {
2381 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2382 		rc	  = established_get_idx(seq, pos);
2383 	}
2384 
2385 	return rc;
2386 }
2387 
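/*
 * Re-walk to the bucket/offset recorded in the iterator state so that a
 * seq_file read spanning several read() calls resumes where the previous
 * one stopped, instead of rescanning from the start.
 */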
2388 static void *tcp_seek_last_pos(struct seq_file *seq)
2389 {
2390 	struct tcp_iter_state *st = seq->private;
2391 	int offset = st->offset;
2392 	int orig_num = st->num;
2393 	void *rc = NULL;
2394 
2395 	switch (st->state) {
2396 	case TCP_SEQ_STATE_LISTENING:
2397 		if (st->bucket >= INET_LHTABLE_SIZE)
2398 			break;
2399 		st->state = TCP_SEQ_STATE_LISTENING;
2400 		rc = listening_get_next(seq, NULL);
2401 		while (offset-- && rc)
2402 			rc = listening_get_next(seq, rc);
2403 		if (rc)
2404 			break;
2405 		st->bucket = 0;
2406 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2407 		fallthrough;
2408 	case TCP_SEQ_STATE_ESTABLISHED:
2409 		if (st->bucket > tcp_hashinfo.ehash_mask)
2410 			break;
2411 		rc = established_get_first(seq);
2412 		while (offset-- && rc)
2413 			rc = established_get_next(seq, rc);
2414 	}
2415 
2416 	st->num = orig_num;
2417 
2418 	return rc;
2419 }
2420 
2421 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2422 {
2423 	struct tcp_iter_state *st = seq->private;
2424 	void *rc;
2425 
2426 	if (*pos && *pos == st->last_pos) {
2427 		rc = tcp_seek_last_pos(seq);
2428 		if (rc)
2429 			goto out;
2430 	}
2431 
2432 	st->state = TCP_SEQ_STATE_LISTENING;
2433 	st->num = 0;
2434 	st->bucket = 0;
2435 	st->offset = 0;
2436 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2437 
2438 out:
2439 	st->last_pos = *pos;
2440 	return rc;
2441 }
2442 EXPORT_SYMBOL(tcp_seq_start);
2443 
2444 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2445 {
2446 	struct tcp_iter_state *st = seq->private;
2447 	void *rc = NULL;
2448 
2449 	if (v == SEQ_START_TOKEN) {
2450 		rc = tcp_get_idx(seq, 0);
2451 		goto out;
2452 	}
2453 
2454 	switch (st->state) {
2455 	case TCP_SEQ_STATE_LISTENING:
2456 		rc = listening_get_next(seq, v);
2457 		if (!rc) {
2458 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2459 			st->bucket = 0;
2460 			st->offset = 0;
2461 			rc	  = established_get_first(seq);
2462 		}
2463 		break;
2464 	case TCP_SEQ_STATE_ESTABLISHED:
2465 		rc = established_get_next(seq, v);
2466 		break;
2467 	}
2468 out:
2469 	++*pos;
2470 	st->last_pos = *pos;
2471 	return rc;
2472 }
2473 EXPORT_SYMBOL(tcp_seq_next);
2474 
2475 void tcp_seq_stop(struct seq_file *seq, void *v)
2476 {
2477 	struct tcp_iter_state *st = seq->private;
2478 
2479 	switch (st->state) {
2480 	case TCP_SEQ_STATE_LISTENING:
2481 		if (v != SEQ_START_TOKEN)
2482 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2483 		break;
2484 	case TCP_SEQ_STATE_ESTABLISHED:
2485 		if (v)
2486 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2487 		break;
2488 	}
2489 }
2490 EXPORT_SYMBOL(tcp_seq_stop);
2491 
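/* The three helpers below each format one row of /proc/net/tcp, matching the
 * header emitted by tcp4_seq_show(): get_openreq4() for request sockets,
 * get_tcp4_sock() for full sockets and get_timewait4_sock() for TIME-WAIT
 * sockets.
 */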
2492 static void get_openreq4(const struct request_sock *req,
2493 			 struct seq_file *f, int i)
2494 {
2495 	const struct inet_request_sock *ireq = inet_rsk(req);
2496 	long delta = req->rsk_timer.expires - jiffies;
2497 
2498 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2499 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2500 		i,
2501 		ireq->ir_loc_addr,
2502 		ireq->ir_num,
2503 		ireq->ir_rmt_addr,
2504 		ntohs(ireq->ir_rmt_port),
2505 		TCP_SYN_RECV,
2506 		0, 0, /* could print option size, but that is af dependent. */
2507 		1,    /* timers active (only the expire timer) */
2508 		jiffies_delta_to_clock_t(delta),
2509 		req->num_timeout,
2510 		from_kuid_munged(seq_user_ns(f),
2511 				 sock_i_uid(req->rsk_listener)),
2512 		0,  /* non standard timer */
2513 		0, /* open_requests have no inode */
2514 		0,
2515 		req);
2516 }
2517 
2518 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2519 {
2520 	int timer_active;
2521 	unsigned long timer_expires;
2522 	const struct tcp_sock *tp = tcp_sk(sk);
2523 	const struct inet_connection_sock *icsk = inet_csk(sk);
2524 	const struct inet_sock *inet = inet_sk(sk);
2525 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2526 	__be32 dest = inet->inet_daddr;
2527 	__be32 src = inet->inet_rcv_saddr;
2528 	__u16 destp = ntohs(inet->inet_dport);
2529 	__u16 srcp = ntohs(inet->inet_sport);
2530 	int rx_queue;
2531 	int state;
2532 
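	/* 'tr' column in /proc/net/tcp: 0 no timer, 1 retransmit/loss-probe
	 * timer, 2 the keepalive timer, 4 the zero window probe timer;
	 * TIME-WAIT rows use 3 (see get_timewait4_sock()).
	 */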
2533 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2534 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2535 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2536 		timer_active	= 1;
2537 		timer_expires	= icsk->icsk_timeout;
2538 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2539 		timer_active	= 4;
2540 		timer_expires	= icsk->icsk_timeout;
2541 	} else if (timer_pending(&sk->sk_timer)) {
2542 		timer_active	= 2;
2543 		timer_expires	= sk->sk_timer.expires;
2544 	} else {
2545 		timer_active	= 0;
2546 		timer_expires = jiffies;
2547 	}
2548 
2549 	state = inet_sk_state_load(sk);
2550 	if (state == TCP_LISTEN)
2551 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2552 	else
2553 		/* Because we don't lock the socket,
2554 		 * we might find a transient negative value.
2555 		 */
2556 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2557 				      READ_ONCE(tp->copied_seq), 0);
2558 
2559 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2560 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2561 		i, src, srcp, dest, destp, state,
2562 		READ_ONCE(tp->write_seq) - tp->snd_una,
2563 		rx_queue,
2564 		timer_active,
2565 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2566 		icsk->icsk_retransmits,
2567 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2568 		icsk->icsk_probes_out,
2569 		sock_i_ino(sk),
2570 		refcount_read(&sk->sk_refcnt), sk,
2571 		jiffies_to_clock_t(icsk->icsk_rto),
2572 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2573 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2574 		tp->snd_cwnd,
2575 		state == TCP_LISTEN ?
2576 		    fastopenq->max_qlen :
2577 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2578 }
2579 
2580 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2581 			       struct seq_file *f, int i)
2582 {
2583 	long delta = tw->tw_timer.expires - jiffies;
2584 	__be32 dest, src;
2585 	__u16 destp, srcp;
2586 
2587 	dest  = tw->tw_daddr;
2588 	src   = tw->tw_rcv_saddr;
2589 	destp = ntohs(tw->tw_dport);
2590 	srcp  = ntohs(tw->tw_sport);
2591 
2592 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2593 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2594 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2595 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2596 		refcount_read(&tw->tw_refcnt), tw);
2597 }
2598 
2599 #define TMPSZ 150
2600 
2601 static int tcp4_seq_show(struct seq_file *seq, void *v)
2602 {
2603 	struct tcp_iter_state *st;
2604 	struct sock *sk = v;
2605 
2606 	seq_setwidth(seq, TMPSZ - 1);
2607 	if (v == SEQ_START_TOKEN) {
2608 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2609 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2610 			   "inode");
2611 		goto out;
2612 	}
2613 	st = seq->private;
2614 
2615 	if (sk->sk_state == TCP_TIME_WAIT)
2616 		get_timewait4_sock(v, seq, st->num);
2617 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2618 		get_openreq4(v, seq, st->num);
2619 	else
2620 		get_tcp4_sock(v, seq, st->num);
2621 out:
2622 	seq_pad(seq, '\n');
2623 	return 0;
2624 }
2625 
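/* BPF TCP iterator: reuses the seq_file walk above but hands every socket to
 * a BPF program attached to the "tcp" iterator target, with a struct
 * bpf_iter__tcp context; the output is read by pinning the iterator link in
 * bpffs (or via bpftool) and reading the pinned file.  A minimal BPF-side
 * sketch (illustrative only, following common libbpf conventions):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (skc)
 *			BPF_SEQ_PRINTF(ctx->meta->seq, "family %u uid %u\n",
 *				       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */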
2626 #ifdef CONFIG_BPF_SYSCALL
2627 struct bpf_iter__tcp {
2628 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2629 	__bpf_md_ptr(struct sock_common *, sk_common);
2630 	uid_t uid __aligned(8);
2631 };
2632 
2633 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2634 			     struct sock_common *sk_common, uid_t uid)
2635 {
2636 	struct bpf_iter__tcp ctx;
2637 
2638 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2639 	ctx.meta = meta;
2640 	ctx.sk_common = sk_common;
2641 	ctx.uid = uid;
2642 	return bpf_iter_run_prog(prog, &ctx);
2643 }
2644 
2645 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2646 {
2647 	struct bpf_iter_meta meta;
2648 	struct bpf_prog *prog;
2649 	struct sock *sk = v;
2650 	uid_t uid;
2651 
2652 	if (v == SEQ_START_TOKEN)
2653 		return 0;
2654 
2655 	if (sk->sk_state == TCP_TIME_WAIT) {
2656 		uid = 0;
2657 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2658 		const struct request_sock *req = v;
2659 
2660 		uid = from_kuid_munged(seq_user_ns(seq),
2661 				       sock_i_uid(req->rsk_listener));
2662 	} else {
2663 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2664 	}
2665 
2666 	meta.seq = seq;
2667 	prog = bpf_iter_get_info(&meta, false);
2668 	return tcp_prog_seq_show(prog, &meta, v, uid);
2669 }
2670 
2671 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2672 {
2673 	struct bpf_iter_meta meta;
2674 	struct bpf_prog *prog;
2675 
2676 	if (!v) {
2677 		meta.seq = seq;
2678 		prog = bpf_iter_get_info(&meta, true);
2679 		if (prog)
2680 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2681 	}
2682 
2683 	tcp_seq_stop(seq, v);
2684 }
2685 
2686 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2687 	.show		= bpf_iter_tcp_seq_show,
2688 	.start		= tcp_seq_start,
2689 	.next		= tcp_seq_next,
2690 	.stop		= bpf_iter_tcp_seq_stop,
2691 };
2692 #endif
2693 
2694 static const struct seq_operations tcp4_seq_ops = {
2695 	.show		= tcp4_seq_show,
2696 	.start		= tcp_seq_start,
2697 	.next		= tcp_seq_next,
2698 	.stop		= tcp_seq_stop,
2699 };
2700 
2701 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2702 	.family		= AF_INET,
2703 };
2704 
2705 static int __net_init tcp4_proc_init_net(struct net *net)
2706 {
2707 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2708 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2709 		return -ENOMEM;
2710 	return 0;
2711 }
2712 
2713 static void __net_exit tcp4_proc_exit_net(struct net *net)
2714 {
2715 	remove_proc_entry("tcp", net->proc_net);
2716 }
2717 
2718 static struct pernet_operations tcp4_net_ops = {
2719 	.init = tcp4_proc_init_net,
2720 	.exit = tcp4_proc_exit_net,
2721 };
2722 
2723 int __init tcp4_proc_init(void)
2724 {
2725 	return register_pernet_subsys(&tcp4_net_ops);
2726 }
2727 
2728 void tcp4_proc_exit(void)
2729 {
2730 	unregister_pernet_subsys(&tcp4_net_ops);
2731 }
2732 #endif /* CONFIG_PROC_FS */
2733 
2734 struct proto tcp_prot = {
2735 	.name			= "TCP",
2736 	.owner			= THIS_MODULE,
2737 	.close			= tcp_close,
2738 	.pre_connect		= tcp_v4_pre_connect,
2739 	.connect		= tcp_v4_connect,
2740 	.disconnect		= tcp_disconnect,
2741 	.accept			= inet_csk_accept,
2742 	.ioctl			= tcp_ioctl,
2743 	.init			= tcp_v4_init_sock,
2744 	.destroy		= tcp_v4_destroy_sock,
2745 	.shutdown		= tcp_shutdown,
2746 	.setsockopt		= tcp_setsockopt,
2747 	.getsockopt		= tcp_getsockopt,
2748 	.keepalive		= tcp_set_keepalive,
2749 	.recvmsg		= tcp_recvmsg,
2750 	.sendmsg		= tcp_sendmsg,
2751 	.sendpage		= tcp_sendpage,
2752 	.backlog_rcv		= tcp_v4_do_rcv,
2753 	.release_cb		= tcp_release_cb,
2754 	.hash			= inet_hash,
2755 	.unhash			= inet_unhash,
2756 	.get_port		= inet_csk_get_port,
2757 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2758 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2759 	.stream_memory_free	= tcp_stream_memory_free,
2760 	.sockets_allocated	= &tcp_sockets_allocated,
2761 	.orphan_count		= &tcp_orphan_count,
2762 	.memory_allocated	= &tcp_memory_allocated,
2763 	.memory_pressure	= &tcp_memory_pressure,
2764 	.sysctl_mem		= sysctl_tcp_mem,
2765 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2766 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2767 	.max_header		= MAX_TCP_HEADER,
2768 	.obj_size		= sizeof(struct tcp_sock),
2769 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2770 	.twsk_prot		= &tcp_timewait_sock_ops,
2771 	.rsk_prot		= &tcp_request_sock_ops,
2772 	.h.hashinfo		= &tcp_hashinfo,
2773 	.no_autobind		= true,
2774 	.diag_destroy		= tcp_abort,
2775 };
2776 EXPORT_SYMBOL(tcp_prot);
2777 
2778 static void __net_exit tcp_sk_exit(struct net *net)
2779 {
2780 	int cpu;
2781 
2782 	if (net->ipv4.tcp_congestion_control)
2783 		bpf_module_put(net->ipv4.tcp_congestion_control,
2784 			       net->ipv4.tcp_congestion_control->owner);
2785 
2786 	for_each_possible_cpu(cpu)
2787 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2788 	free_percpu(net->ipv4.tcp_sk);
2789 }
2790 
2791 static int __net_init tcp_sk_init(struct net *net)
2792 {
2793 	int res, cpu, cnt;
2794 
2795 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2796 	if (!net->ipv4.tcp_sk)
2797 		return -ENOMEM;
2798 
2799 	for_each_possible_cpu(cpu) {
2800 		struct sock *sk;
2801 
2802 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2803 					   IPPROTO_TCP, net);
2804 		if (res)
2805 			goto fail;
2806 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2807 
2808 		/* Enforce IP_DF and IPID == 0 for RSTs and
2809 		 * ACKs sent in the SYN-RECV and TIME-WAIT states.
2810 		 */
2811 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2812 
2813 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2814 	}
2815 
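	/* Per-netns defaults below; each field is exposed as a sysctl under
	 * /proc/sys/net/ipv4/ (e.g. sysctl_tcp_ecn <-> net.ipv4.tcp_ecn,
	 * sysctl_tcp_syncookies <-> net.ipv4.tcp_syncookies).
	 */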
2816 	net->ipv4.sysctl_tcp_ecn = 2;
2817 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2818 
2819 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2820 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2821 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2822 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2823 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2824 
2825 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2826 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2827 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2828 
2829 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2830 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2831 	net->ipv4.sysctl_tcp_syncookies = 1;
2832 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2833 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2834 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2835 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2836 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2837 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2838 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2839 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2840 
2841 	cnt = tcp_hashinfo.ehash_mask + 1;
2842 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2843 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2844 
2845 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2846 	net->ipv4.sysctl_tcp_sack = 1;
2847 	net->ipv4.sysctl_tcp_window_scaling = 1;
2848 	net->ipv4.sysctl_tcp_timestamps = 1;
2849 	net->ipv4.sysctl_tcp_early_retrans = 3;
2850 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2851 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2852 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2853 	net->ipv4.sysctl_tcp_max_reordering = 300;
2854 	net->ipv4.sysctl_tcp_dsack = 1;
2855 	net->ipv4.sysctl_tcp_app_win = 31;
2856 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2857 	net->ipv4.sysctl_tcp_frto = 2;
2858 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2859 	/* This limits the percentage of the congestion window which we
2860 	 * will allow a single TSO frame to consume.  Building TSO frames
2861 	 * which are too large can cause TCP streams to be bursty.
2862 	 */
2863 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2864 	/* Default TSQ limit of 16 TSO segments */
2865 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2866 	/* RFC 5961 challenge ACK rate limiting */
2867 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2868 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2869 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2870 	net->ipv4.sysctl_tcp_autocorking = 1;
2871 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2872 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2873 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2874 	if (net != &init_net) {
2875 		memcpy(net->ipv4.sysctl_tcp_rmem,
2876 		       init_net.ipv4.sysctl_tcp_rmem,
2877 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2878 		memcpy(net->ipv4.sysctl_tcp_wmem,
2879 		       init_net.ipv4.sysctl_tcp_wmem,
2880 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2881 	}
2882 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2883 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2884 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2885 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2886 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2887 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2888 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2889 
2890 	/* Reno is always built in */
2891 	if (!net_eq(net, &init_net) &&
2892 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2893 			       init_net.ipv4.tcp_congestion_control->owner))
2894 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2895 	else
2896 		net->ipv4.tcp_congestion_control = &tcp_reno;
2897 
2898 	return 0;
2899 fail:
2900 	tcp_sk_exit(net);
2901 
2902 	return res;
2903 }
2904 
2905 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2906 {
2907 	struct net *net;
2908 
2909 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2910 
2911 	list_for_each_entry(net, net_exit_list, exit_list)
2912 		tcp_fastopen_ctx_destroy(net);
2913 }
2914 
2915 static struct pernet_operations __net_initdata tcp_sk_ops = {
2916        .init	   = tcp_sk_init,
2917        .exit	   = tcp_sk_exit,
2918        .exit_batch = tcp_sk_exit_batch,
2919 };
2920 
2921 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2922 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2923 		     struct sock_common *sk_common, uid_t uid)
2924 
2925 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
2926 {
2927 	struct tcp_iter_state *st = priv_data;
2928 	struct tcp_seq_afinfo *afinfo;
2929 	int ret;
2930 
2931 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2932 	if (!afinfo)
2933 		return -ENOMEM;
2934 
2935 	afinfo->family = AF_UNSPEC;
2936 	st->bpf_seq_afinfo = afinfo;
2937 	ret = bpf_iter_init_seq_net(priv_data, aux);
2938 	if (ret)
2939 		kfree(afinfo);
2940 	return ret;
2941 }
2942 
2943 static void bpf_iter_fini_tcp(void *priv_data)
2944 {
2945 	struct tcp_iter_state *st = priv_data;
2946 
2947 	kfree(st->bpf_seq_afinfo);
2948 	bpf_iter_fini_seq_net(priv_data);
2949 }
2950 
2951 static const struct bpf_iter_seq_info tcp_seq_info = {
2952 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2953 	.init_seq_private	= bpf_iter_init_tcp,
2954 	.fini_seq_private	= bpf_iter_fini_tcp,
2955 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2956 };
2957 
2958 static struct bpf_iter_reg tcp_reg_info = {
2959 	.target			= "tcp",
2960 	.ctx_arg_info_size	= 1,
2961 	.ctx_arg_info		= {
2962 		{ offsetof(struct bpf_iter__tcp, sk_common),
2963 		  PTR_TO_BTF_ID_OR_NULL },
2964 	},
2965 	.seq_info		= &tcp_seq_info,
2966 };
2967 
2968 static void __init bpf_iter_register(void)
2969 {
2970 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
2971 	if (bpf_iter_reg_target(&tcp_reg_info))
2972 		pr_warn("Warning: could not register bpf iterator tcp\n");
2973 }
2974 
2975 #endif
2976 
2977 void __init tcp_v4_init(void)
2978 {
2979 	if (register_pernet_subsys(&tcp_sk_ops))
2980 		panic("Failed to create the TCP control socket.\n");
2981 
2982 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2983 	bpf_iter_register();
2984 #endif
2985 }
2986