xref: /linux/net/ipv4/tcp_ipv4.c (revision 93a3545d812ae7cfe4426374e00a7d8f64ac02e0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 
80 #include <crypto/hash.h>
81 #include <linux/scatterlist.h>
82 
83 #include <trace/events/tcp.h>
84 
85 #ifdef CONFIG_TCP_MD5SIG
86 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
87 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
88 #endif
89 
90 struct inet_hashinfo tcp_hashinfo;
91 EXPORT_SYMBOL(tcp_hashinfo);
92 
93 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
94 {
95 	return secure_tcp_seq(ip_hdr(skb)->daddr,
96 			      ip_hdr(skb)->saddr,
97 			      tcp_hdr(skb)->dest,
98 			      tcp_hdr(skb)->source);
99 }
100 
101 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
102 {
103 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
104 }
105 
106 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
107 {
108 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
109 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
110 	struct tcp_sock *tp = tcp_sk(sk);
111 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
112 
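	/* sysctl_tcp_tw_reuse: 0 - never reuse a TIME-WAIT socket for a new
	 * outgoing connection, 1 - reuse when it is safe from the protocol
	 * point of view, 2 - reuse only for connections over loopback
	 * (verified below).
	 */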
113 	if (reuse == 2) {
114 		/* Still does not detect *everything* that goes through
115 		 * lo, since we require a loopback src or dst address
116 		 * or direct binding to 'lo' interface.
117 		 */
118 		bool loopback = false;
119 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
120 			loopback = true;
121 #if IS_ENABLED(CONFIG_IPV6)
122 		if (tw->tw_family == AF_INET6) {
123 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
124 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
126 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
127 				loopback = true;
128 		} else
129 #endif
130 		{
131 			if (ipv4_is_loopback(tw->tw_daddr) ||
132 			    ipv4_is_loopback(tw->tw_rcv_saddr))
133 				loopback = true;
134 		}
135 		if (!loopback)
136 			reuse = 0;
137 	}
138 
139 	/* With PAWS, it is safe from the viewpoint
140 	   of data integrity. Even without PAWS it is safe provided sequence
141 	   spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
142 
143 	   Actually, the idea is close to VJ's one, only the timestamp cache is
144 	   held not per host, but per port pair, and the TW bucket is used as
145 	   the state holder.
146 
147 	   If the TW bucket has already been destroyed we fall back to VJ's
148 	   scheme and use the initial timestamp retrieved from the peer table.
149 	 */
150 	if (tcptw->tw_ts_recent_stamp &&
151 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
152 					    tcptw->tw_ts_recent_stamp)))) {
153 		/* In case of repair and re-using TIME-WAIT sockets we still
154 		 * want to be sure that it is safe as above but honor the
155 		 * sequence numbers and time stamps set as part of the repair
156 		 * process.
157 		 *
158 		 * Without this check re-using a TIME-WAIT socket with TCP
159 		 * repair would accumulate a -1 on the repair assigned
160 		 * sequence number. The first time it is reused the sequence
161 		 * is -1, the second time -2, etc. This fixes that issue
162 		 * without appearing to create any others.
163 		 */
164 		if (likely(!tp->repair)) {
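			/* Place the new write_seq one maximum window (plus 2)
			 * past the old connection's snd_nxt, so that stray
			 * segments from the previous incarnation cannot land
			 * inside the new sequence space.
			 */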
165 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
166 
167 			if (!seq)
168 				seq = 1;
169 			WRITE_ONCE(tp->write_seq, seq);
170 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
171 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
172 		}
173 		sock_hold(sktw);
174 		return 1;
175 	}
176 
177 	return 0;
178 }
179 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
180 
181 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
182 			      int addr_len)
183 {
184 	/* This check is replicated from tcp_v4_connect() and intended to
185 	 * prevent the BPF program called below from accessing bytes that are
186 	 * out of the bounds specified by the user in addr_len.
187 	 */
188 	if (addr_len < sizeof(struct sockaddr_in))
189 		return -EINVAL;
190 
191 	sock_owned_by_me(sk);
192 
193 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
194 }
195 
196 /* This will initiate an outgoing connection. */
197 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
198 {
199 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
200 	struct inet_sock *inet = inet_sk(sk);
201 	struct tcp_sock *tp = tcp_sk(sk);
202 	__be16 orig_sport, orig_dport;
203 	__be32 daddr, nexthop;
204 	struct flowi4 *fl4;
205 	struct rtable *rt;
206 	int err;
207 	struct ip_options_rcu *inet_opt;
208 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
209 
210 	if (addr_len < sizeof(struct sockaddr_in))
211 		return -EINVAL;
212 
213 	if (usin->sin_family != AF_INET)
214 		return -EAFNOSUPPORT;
215 
216 	nexthop = daddr = usin->sin_addr.s_addr;
217 	inet_opt = rcu_dereference_protected(inet->inet_opt,
218 					     lockdep_sock_is_held(sk));
219 	if (inet_opt && inet_opt->opt.srr) {
220 		if (!daddr)
221 			return -EINVAL;
222 		nexthop = inet_opt->opt.faddr;
223 	}
224 
225 	orig_sport = inet->inet_sport;
226 	orig_dport = usin->sin_port;
227 	fl4 = &inet->cork.fl.u.ip4;
228 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
229 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
230 			      IPPROTO_TCP,
231 			      orig_sport, orig_dport, sk);
232 	if (IS_ERR(rt)) {
233 		err = PTR_ERR(rt);
234 		if (err == -ENETUNREACH)
235 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
236 		return err;
237 	}
238 
239 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
240 		ip_rt_put(rt);
241 		return -ENETUNREACH;
242 	}
243 
244 	if (!inet_opt || !inet_opt->opt.srr)
245 		daddr = fl4->daddr;
246 
247 	if (!inet->inet_saddr)
248 		inet->inet_saddr = fl4->saddr;
249 	sk_rcv_saddr_set(sk, inet->inet_saddr);
250 
251 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
252 		/* Reset inherited state */
253 		tp->rx_opt.ts_recent	   = 0;
254 		tp->rx_opt.ts_recent_stamp = 0;
255 		if (likely(!tp->repair))
256 			WRITE_ONCE(tp->write_seq, 0);
257 	}
258 
259 	inet->inet_dport = usin->sin_port;
260 	sk_daddr_set(sk, daddr);
261 
262 	inet_csk(sk)->icsk_ext_hdr_len = 0;
263 	if (inet_opt)
264 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
265 
266 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
267 
268 	/* Socket identity is still unknown (sport may be zero).
269 	 * However we set the state to SYN-SENT and, without releasing the
270 	 * socket lock, select a source port, enter ourselves into the hash
271 	 * tables and complete initialization after this.
272 	 */
273 	tcp_set_state(sk, TCP_SYN_SENT);
274 	err = inet_hash_connect(tcp_death_row, sk);
275 	if (err)
276 		goto failure;
277 
278 	sk_set_txhash(sk);
279 
280 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
281 			       inet->inet_sport, inet->inet_dport, sk);
282 	if (IS_ERR(rt)) {
283 		err = PTR_ERR(rt);
284 		rt = NULL;
285 		goto failure;
286 	}
287 	/* OK, now commit destination to socket.  */
288 	sk->sk_gso_type = SKB_GSO_TCPV4;
289 	sk_setup_caps(sk, &rt->dst);
290 	rt = NULL;
291 
292 	if (likely(!tp->repair)) {
293 		if (!tp->write_seq)
294 			WRITE_ONCE(tp->write_seq,
295 				   secure_tcp_seq(inet->inet_saddr,
296 						  inet->inet_daddr,
297 						  inet->inet_sport,
298 						  usin->sin_port));
299 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
300 						 inet->inet_saddr,
301 						 inet->inet_daddr);
302 	}
303 
304 	inet->inet_id = prandom_u32();
305 
306 	if (tcp_fastopen_defer_connect(sk, &err))
307 		return err;
308 	if (err)
309 		goto failure;
310 
311 	err = tcp_connect(sk);
312 
313 	if (err)
314 		goto failure;
315 
316 	return 0;
317 
318 failure:
319 	/*
320 	 * This unhashes the socket and releases the local port,
321 	 * if necessary.
322 	 */
323 	tcp_set_state(sk, TCP_CLOSE);
324 	ip_rt_put(rt);
325 	sk->sk_route_caps = 0;
326 	inet->inet_dport = 0;
327 	return err;
328 }
329 EXPORT_SYMBOL(tcp_v4_connect);
330 
331 /*
332  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
333  * It can be called through tcp_release_cb() if socket was owned by user
334  * at the time tcp_v4_err() was called to handle ICMP message.
335  */
336 void tcp_v4_mtu_reduced(struct sock *sk)
337 {
338 	struct inet_sock *inet = inet_sk(sk);
339 	struct dst_entry *dst;
340 	u32 mtu;
341 
342 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
343 		return;
344 	mtu = tcp_sk(sk)->mtu_info;
345 	dst = inet_csk_update_pmtu(sk, mtu);
346 	if (!dst)
347 		return;
348 
349 	/* Something is about to go wrong... Remember the soft error
350 	 * in case this connection is not able to recover.
351 	 */
352 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
353 		sk->sk_err_soft = EMSGSIZE;
354 
355 	mtu = dst_mtu(dst);
356 
357 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
358 	    ip_sk_accept_pmtu(sk) &&
359 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
360 		tcp_sync_mss(sk, mtu);
361 
362 		/* Resend the TCP packet because it's
363 		 * clear that the old packet has been
364 		 * dropped. This is the new "fast" path mtu
365 		 * discovery.
366 		 */
367 		tcp_simple_retransmit(sk);
368 	} /* else let the usual retransmit timer handle it */
369 }
370 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
371 
372 static void do_redirect(struct sk_buff *skb, struct sock *sk)
373 {
374 	struct dst_entry *dst = __sk_dst_check(sk, 0);
375 
376 	if (dst)
377 		dst->ops->redirect(dst, sk, skb);
378 }
379 
380 
381 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
382 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
383 {
384 	struct request_sock *req = inet_reqsk(sk);
385 	struct net *net = sock_net(sk);
386 
387 	/* ICMPs are not backlogged, hence we cannot get
388 	 * an established socket here.
389 	 */
390 	if (seq != tcp_rsk(req)->snt_isn) {
391 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
392 	} else if (abort) {
393 		/*
394 		 * Still in SYN_RECV, just remove it silently.
395 		 * There is no good way to pass the error to the newly
396 		 * created socket, and POSIX does not want network
397 		 * errors returned from accept().
398 		 */
399 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
400 		tcp_listendrop(req->rsk_listener);
401 	}
402 	reqsk_put(req);
403 }
404 EXPORT_SYMBOL(tcp_req_err);
405 
406 /* TCP-LD (RFC 6069) logic */
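/* On an ICMP unreachable hint that matches the first unacknowledged segment,
 * undo one step of exponential backoff and either re-arm the retransmit timer
 * with the remaining time or retransmit immediately.
 */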
407 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
408 {
409 	struct inet_connection_sock *icsk = inet_csk(sk);
410 	struct tcp_sock *tp = tcp_sk(sk);
411 	struct sk_buff *skb;
412 	s32 remaining;
413 	u32 delta_us;
414 
415 	if (sock_owned_by_user(sk))
416 		return;
417 
418 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
419 	    !icsk->icsk_backoff)
420 		return;
421 
422 	skb = tcp_rtx_queue_head(sk);
423 	if (WARN_ON_ONCE(!skb))
424 		return;
425 
426 	icsk->icsk_backoff--;
427 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
428 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
429 
430 	tcp_mstamp_refresh(tp);
431 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
432 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
433 
434 	if (remaining > 0) {
435 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
436 					  remaining, TCP_RTO_MAX);
437 	} else {
438 		/* RTO revert clocked out retransmission.
439 		 * Will retransmit now.
440 		 */
441 		tcp_retransmit_timer(sk);
442 	}
443 }
444 EXPORT_SYMBOL(tcp_ld_RTO_revert);
445 
446 /*
447  * This routine is called by the ICMP module when it gets some
448  * sort of error condition.  If err < 0 then the socket should
449  * be closed and the error returned to the user.  If err > 0
450  * it's just the icmp type << 8 | icmp code.  After adjustment
451  * header points to the first 8 bytes of the tcp header.  We need
452  * to find the appropriate port.
453  *
454  * The locking strategy used here is very "optimistic". When
455  * someone else accesses the socket the ICMP is just dropped
456  * and for some paths there is no check at all.
457  * A more general error queue to queue errors for later handling
458  * is probably better.
459  *
460  */
461 
462 int tcp_v4_err(struct sk_buff *skb, u32 info)
463 {
464 	const struct iphdr *iph = (const struct iphdr *)skb->data;
465 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
466 	struct tcp_sock *tp;
467 	struct inet_sock *inet;
468 	const int type = icmp_hdr(skb)->type;
469 	const int code = icmp_hdr(skb)->code;
470 	struct sock *sk;
471 	struct request_sock *fastopen;
472 	u32 seq, snd_una;
473 	int err;
474 	struct net *net = dev_net(skb->dev);
475 
476 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
477 				       th->dest, iph->saddr, ntohs(th->source),
478 				       inet_iif(skb), 0);
479 	if (!sk) {
480 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
481 		return -ENOENT;
482 	}
483 	if (sk->sk_state == TCP_TIME_WAIT) {
484 		inet_twsk_put(inet_twsk(sk));
485 		return 0;
486 	}
487 	seq = ntohl(th->seq);
488 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
489 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
490 				     type == ICMP_TIME_EXCEEDED ||
491 				     (type == ICMP_DEST_UNREACH &&
492 				      (code == ICMP_NET_UNREACH ||
493 				       code == ICMP_HOST_UNREACH)));
494 		return 0;
495 	}
496 
497 	bh_lock_sock(sk);
498 	/* If too many ICMPs get dropped on busy
499 	 * servers this needs to be solved differently.
500 	 * We do take care of the PMTU discovery (RFC 1191) special case:
501 	 * we can receive locally generated ICMP messages while the socket is held.
502 	 */
503 	if (sock_owned_by_user(sk)) {
504 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
505 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
506 	}
507 	if (sk->sk_state == TCP_CLOSE)
508 		goto out;
509 
510 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
511 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
512 		goto out;
513 	}
514 
515 	tp = tcp_sk(sk);
516 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
517 	fastopen = rcu_dereference(tp->fastopen_rsk);
518 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
519 	if (sk->sk_state != TCP_LISTEN &&
520 	    !between(seq, snd_una, tp->snd_nxt)) {
521 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
522 		goto out;
523 	}
524 
525 	switch (type) {
526 	case ICMP_REDIRECT:
527 		if (!sock_owned_by_user(sk))
528 			do_redirect(skb, sk);
529 		goto out;
530 	case ICMP_SOURCE_QUENCH:
531 		/* Just silently ignore these. */
532 		goto out;
533 	case ICMP_PARAMETERPROB:
534 		err = EPROTO;
535 		break;
536 	case ICMP_DEST_UNREACH:
537 		if (code > NR_ICMP_UNREACH)
538 			goto out;
539 
540 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
541 			/* We are not interested in TCP_LISTEN and open_requests
542 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
543 			 * they should go through unfragmented).
544 			 */
545 			if (sk->sk_state == TCP_LISTEN)
546 				goto out;
547 
548 			tp->mtu_info = info;
549 			if (!sock_owned_by_user(sk)) {
550 				tcp_v4_mtu_reduced(sk);
551 			} else {
552 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
553 					sock_hold(sk);
554 			}
555 			goto out;
556 		}
557 
558 		err = icmp_err_convert[code].errno;
559 		/* check if this ICMP message allows revert of backoff.
560 		 * (see RFC 6069)
561 		 */
562 		if (!fastopen &&
563 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
564 			tcp_ld_RTO_revert(sk, seq);
565 		break;
566 	case ICMP_TIME_EXCEEDED:
567 		err = EHOSTUNREACH;
568 		break;
569 	default:
570 		goto out;
571 	}
572 
573 	switch (sk->sk_state) {
574 	case TCP_SYN_SENT:
575 	case TCP_SYN_RECV:
576 		/* Only in fast or simultaneous open. If a fast open socket is
577 		 * already accepted it is treated as a connected one below.
578 		 */
579 		if (fastopen && !fastopen->sk)
580 			break;
581 
582 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
583 
584 		if (!sock_owned_by_user(sk)) {
585 			sk->sk_err = err;
586 
587 			sk->sk_error_report(sk);
588 
589 			tcp_done(sk);
590 		} else {
591 			sk->sk_err_soft = err;
592 		}
593 		goto out;
594 	}
595 
596 	/* If we've already connected we will keep trying
597 	 * until we time out, or the user gives up.
598 	 *
599 	 * rfc1122 4.2.3.9 allows us to consider as hard errors
600 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
601 	 * but it is obsoleted by pmtu discovery).
602 	 *
603 	 * Note that in the modern internet, where routing is unreliable
604 	 * and broken firewalls sit in every dark corner sending random
605 	 * errors ordered by their masters, even these two messages have
606 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
607 	 *
608 	 * Now we are in compliance with RFCs.
609 	 *							--ANK (980905)
610 	 */
611 
612 	inet = inet_sk(sk);
613 	if (!sock_owned_by_user(sk) && inet->recverr) {
614 		sk->sk_err = err;
615 		sk->sk_error_report(sk);
616 	} else	{ /* Only an error on timeout */
617 		sk->sk_err_soft = err;
618 	}
619 
620 out:
621 	bh_unlock_sock(sk);
622 	sock_put(sk);
623 	return 0;
624 }
625 
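/* Prepare a partial (pseudo-header only) checksum in th->check and record via
 * csum_start/csum_offset where the device, or the software fallback, must
 * complete it over the TCP header and payload.
 */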
626 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
627 {
628 	struct tcphdr *th = tcp_hdr(skb);
629 
630 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
631 	skb->csum_start = skb_transport_header(skb) - skb->head;
632 	skb->csum_offset = offsetof(struct tcphdr, check);
633 }
634 
635 /* This routine computes an IPv4 TCP checksum. */
636 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
637 {
638 	const struct inet_sock *inet = inet_sk(sk);
639 
640 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
641 }
642 EXPORT_SYMBOL(tcp_v4_send_check);
643 
644 /*
645  *	This routine will send an RST to the other tcp.
646  *
647  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
648  *		      for the reset?
649  *	Answer: if a packet caused an RST, it is not for a socket
650  *		existing in our system; if it is matched to a socket,
651  *		it is just a duplicate segment or a bug in the other side's TCP.
652  *		So we build the reply based only on the parameters that
653  *		arrived with the segment.
654  *	Exception: precedence violation. We do not implement it in any case.
655  */
656 
657 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
658 {
659 	const struct tcphdr *th = tcp_hdr(skb);
660 	struct {
661 		struct tcphdr th;
662 #ifdef CONFIG_TCP_MD5SIG
663 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
664 #endif
665 	} rep;
666 	struct ip_reply_arg arg;
667 #ifdef CONFIG_TCP_MD5SIG
668 	struct tcp_md5sig_key *key = NULL;
669 	const __u8 *hash_location = NULL;
670 	unsigned char newhash[16];
671 	int genhash;
672 	struct sock *sk1 = NULL;
673 #endif
674 	u64 transmit_time = 0;
675 	struct sock *ctl_sk;
676 	struct net *net;
677 
678 	/* Never send a reset in response to a reset. */
679 	if (th->rst)
680 		return;
681 
682 	/* If sk is not NULL, it means we did a successful lookup and the
683 	 * incoming route had to be correct. prequeue might have dropped our dst.
684 	 */
685 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
686 		return;
687 
688 	/* Swap the send and the receive. */
689 	memset(&rep, 0, sizeof(rep));
690 	rep.th.dest   = th->source;
691 	rep.th.source = th->dest;
692 	rep.th.doff   = sizeof(struct tcphdr) / 4;
693 	rep.th.rst    = 1;
694 
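	/* Per RFC 793: if the offending segment carried an ACK, the RST reuses
	 * that acknowledgment number as its own sequence number; otherwise the
	 * RST carries an ACK covering everything in the received segment
	 * (SYN and FIN each count as one unit of sequence space).
	 */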
695 	if (th->ack) {
696 		rep.th.seq = th->ack_seq;
697 	} else {
698 		rep.th.ack = 1;
699 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
700 				       skb->len - (th->doff << 2));
701 	}
702 
703 	memset(&arg, 0, sizeof(arg));
704 	arg.iov[0].iov_base = (unsigned char *)&rep;
705 	arg.iov[0].iov_len  = sizeof(rep.th);
706 
707 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
708 #ifdef CONFIG_TCP_MD5SIG
709 	rcu_read_lock();
710 	hash_location = tcp_parse_md5sig_option(th);
711 	if (sk && sk_fullsock(sk)) {
712 		const union tcp_md5_addr *addr;
713 		int l3index;
714 
715 		/* If sdif is set, the packet ingressed via a device
716 		 * in an L3 domain and inet_iif is set to it.
717 		 */
718 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
719 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
720 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
721 	} else if (hash_location) {
722 		const union tcp_md5_addr *addr;
723 		int sdif = tcp_v4_sdif(skb);
724 		int dif = inet_iif(skb);
725 		int l3index;
726 
727 		/*
728 		 * The active side is lost. Try to find the listening socket
729 		 * through the source port, and then find the md5 key through
730 		 * the listening socket. We do not lose security here:
731 		 * the incoming packet is checked against the md5 hash of the
732 		 * key we found, and no RST is generated if the hash doesn't match.
733 		 */
734 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
735 					     ip_hdr(skb)->saddr,
736 					     th->source, ip_hdr(skb)->daddr,
737 					     ntohs(th->source), dif, sdif);
738 		/* don't send rst if it can't find key */
739 		if (!sk1)
740 			goto out;
741 
742 		/* If sdif is set, the packet ingressed via a device
743 		 * in an L3 domain and dif is set to it.
744 		 */
745 		l3index = sdif ? dif : 0;
746 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
747 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
748 		if (!key)
749 			goto out;
750 
751 
752 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
753 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
754 			goto out;
755 
756 	}
757 
758 	if (key) {
759 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
760 				   (TCPOPT_NOP << 16) |
761 				   (TCPOPT_MD5SIG << 8) |
762 				   TCPOLEN_MD5SIG);
763 		/* Update length and the length the header thinks exists */
764 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
765 		rep.th.doff = arg.iov[0].iov_len / 4;
766 
767 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
768 				     key, ip_hdr(skb)->saddr,
769 				     ip_hdr(skb)->daddr, &rep.th);
770 	}
771 #endif
772 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
773 				      ip_hdr(skb)->saddr, /* XXX */
774 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
775 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
776 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
777 
778 	/* When the socket is gone, all binding information is lost and
779 	 * routing might fail. No choice here: if we chose to force the
780 	 * input interface, we would misroute in case of an asymmetric route.
781 	 */
782 	if (sk) {
783 		arg.bound_dev_if = sk->sk_bound_dev_if;
784 		if (sk_fullsock(sk))
785 			trace_tcp_send_reset(sk, skb);
786 	}
787 
788 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
789 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
790 
791 	arg.tos = ip_hdr(skb)->tos;
792 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
793 	local_bh_disable();
794 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
795 	if (sk) {
796 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
797 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
798 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
799 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
800 		transmit_time = tcp_transmit_time(sk);
801 	}
802 	ip_send_unicast_reply(ctl_sk,
803 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
804 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
805 			      &arg, arg.iov[0].iov_len,
806 			      transmit_time);
807 
808 	ctl_sk->sk_mark = 0;
809 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
810 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
811 	local_bh_enable();
812 
813 #ifdef CONFIG_TCP_MD5SIG
814 out:
815 	rcu_read_unlock();
816 #endif
817 }
818 
819 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
820    outside of socket context, is certainly ugly. What can I do?
821  */
822 
823 static void tcp_v4_send_ack(const struct sock *sk,
824 			    struct sk_buff *skb, u32 seq, u32 ack,
825 			    u32 win, u32 tsval, u32 tsecr, int oif,
826 			    struct tcp_md5sig_key *key,
827 			    int reply_flags, u8 tos)
828 {
829 	const struct tcphdr *th = tcp_hdr(skb);
830 	struct {
831 		struct tcphdr th;
832 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
833 #ifdef CONFIG_TCP_MD5SIG
834 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
835 #endif
836 			];
837 	} rep;
838 	struct net *net = sock_net(sk);
839 	struct ip_reply_arg arg;
840 	struct sock *ctl_sk;
841 	u64 transmit_time;
842 
843 	memset(&rep.th, 0, sizeof(struct tcphdr));
844 	memset(&arg, 0, sizeof(arg));
845 
846 	arg.iov[0].iov_base = (unsigned char *)&rep;
847 	arg.iov[0].iov_len  = sizeof(rep.th);
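	/* When echoing a timestamp, prepend the usual 12-byte aligned layout
	 * <NOP><NOP><TIMESTAMP kind=8 len=10> ahead of any MD5 option below.
	 */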
848 	if (tsecr) {
849 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
850 				   (TCPOPT_TIMESTAMP << 8) |
851 				   TCPOLEN_TIMESTAMP);
852 		rep.opt[1] = htonl(tsval);
853 		rep.opt[2] = htonl(tsecr);
854 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
855 	}
856 
857 	/* Swap the send and the receive. */
858 	rep.th.dest    = th->source;
859 	rep.th.source  = th->dest;
860 	rep.th.doff    = arg.iov[0].iov_len / 4;
861 	rep.th.seq     = htonl(seq);
862 	rep.th.ack_seq = htonl(ack);
863 	rep.th.ack     = 1;
864 	rep.th.window  = htons(win);
865 
866 #ifdef CONFIG_TCP_MD5SIG
867 	if (key) {
868 		int offset = (tsecr) ? 3 : 0;
869 
870 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
871 					  (TCPOPT_NOP << 16) |
872 					  (TCPOPT_MD5SIG << 8) |
873 					  TCPOLEN_MD5SIG);
874 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
875 		rep.th.doff = arg.iov[0].iov_len/4;
876 
877 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
878 				    key, ip_hdr(skb)->saddr,
879 				    ip_hdr(skb)->daddr, &rep.th);
880 	}
881 #endif
882 	arg.flags = reply_flags;
883 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
884 				      ip_hdr(skb)->saddr, /* XXX */
885 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
886 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
887 	if (oif)
888 		arg.bound_dev_if = oif;
889 	arg.tos = tos;
890 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
891 	local_bh_disable();
892 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
893 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
894 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
895 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
896 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
897 	transmit_time = tcp_transmit_time(sk);
898 	ip_send_unicast_reply(ctl_sk,
899 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
900 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
901 			      &arg, arg.iov[0].iov_len,
902 			      transmit_time);
903 
904 	ctl_sk->sk_mark = 0;
905 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
906 	local_bh_enable();
907 }
908 
909 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
910 {
911 	struct inet_timewait_sock *tw = inet_twsk(sk);
912 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
913 
914 	tcp_v4_send_ack(sk, skb,
915 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
916 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
917 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
918 			tcptw->tw_ts_recent,
919 			tw->tw_bound_dev_if,
920 			tcp_twsk_md5_key(tcptw),
921 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
922 			tw->tw_tos
923 			);
924 
925 	inet_twsk_put(tw);
926 }
927 
928 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
929 				  struct request_sock *req)
930 {
931 	const union tcp_md5_addr *addr;
932 	int l3index;
933 
934 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
935 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
936 	 */
937 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
938 					     tcp_sk(sk)->snd_nxt;
939 
940 	/* RFC 7323 2.3
941 	 * The window field (SEG.WND) of every outgoing segment, with the
942 	 * exception of <SYN> segments, MUST be right-shifted by
943 	 * Rcv.Wind.Shift bits:
944 	 */
945 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
946 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
947 	tcp_v4_send_ack(sk, skb, seq,
948 			tcp_rsk(req)->rcv_nxt,
949 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
950 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
951 			req->ts_recent,
952 			0,
953 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
954 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
955 			ip_hdr(skb)->tos);
956 }
957 
958 /*
959  *	Send a SYN-ACK after having received a SYN.
960  *	This still operates on a request_sock only, not on a big
961  *	socket.
962  */
963 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
964 			      struct flowi *fl,
965 			      struct request_sock *req,
966 			      struct tcp_fastopen_cookie *foc,
967 			      enum tcp_synack_type synack_type)
968 {
969 	const struct inet_request_sock *ireq = inet_rsk(req);
970 	struct flowi4 fl4;
971 	int err = -1;
972 	struct sk_buff *skb;
973 
974 	/* First, grab a route. */
975 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
976 		return -1;
977 
978 	skb = tcp_make_synack(sk, dst, req, foc, synack_type);
979 
980 	if (skb) {
981 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
982 
983 		rcu_read_lock();
984 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
985 					    ireq->ir_rmt_addr,
986 					    rcu_dereference(ireq->ireq_opt));
987 		rcu_read_unlock();
988 		err = net_xmit_eval(err);
989 	}
990 
991 	return err;
992 }
993 
994 /*
995  *	IPv4 request_sock destructor.
996  */
997 static void tcp_v4_reqsk_destructor(struct request_sock *req)
998 {
999 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1000 }
1001 
1002 #ifdef CONFIG_TCP_MD5SIG
1003 /*
1004  * RFC2385 MD5 checksumming requires a mapping of
1005  * IP address->MD5 Key.
1006  * We need to maintain these in the sk structure.
1007  */
1008 
1009 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1010 EXPORT_SYMBOL(tcp_md5_needed);
1011 
1012 /* Find the Key structure for an address.  */
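/* The lookup is a longest-prefix match: among all keys whose address prefix
 * (and, when non-zero, L3 domain) matches, the key with the largest prefixlen
 * wins.
 */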
1013 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1014 					   const union tcp_md5_addr *addr,
1015 					   int family)
1016 {
1017 	const struct tcp_sock *tp = tcp_sk(sk);
1018 	struct tcp_md5sig_key *key;
1019 	const struct tcp_md5sig_info *md5sig;
1020 	__be32 mask;
1021 	struct tcp_md5sig_key *best_match = NULL;
1022 	bool match;
1023 
1024 	/* caller either holds rcu_read_lock() or socket lock */
1025 	md5sig = rcu_dereference_check(tp->md5sig_info,
1026 				       lockdep_sock_is_held(sk));
1027 	if (!md5sig)
1028 		return NULL;
1029 
1030 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1031 				 lockdep_sock_is_held(sk)) {
1032 		if (key->family != family)
1033 			continue;
1034 		if (key->l3index && key->l3index != l3index)
1035 			continue;
1036 		if (family == AF_INET) {
1037 			mask = inet_make_mask(key->prefixlen);
1038 			match = (key->addr.a4.s_addr & mask) ==
1039 				(addr->a4.s_addr & mask);
1040 #if IS_ENABLED(CONFIG_IPV6)
1041 		} else if (family == AF_INET6) {
1042 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1043 						  key->prefixlen);
1044 #endif
1045 		} else {
1046 			match = false;
1047 		}
1048 
1049 		if (match && (!best_match ||
1050 			      key->prefixlen > best_match->prefixlen))
1051 			best_match = key;
1052 	}
1053 	return best_match;
1054 }
1055 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1056 
1057 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1058 						      const union tcp_md5_addr *addr,
1059 						      int family, u8 prefixlen,
1060 						      int l3index)
1061 {
1062 	const struct tcp_sock *tp = tcp_sk(sk);
1063 	struct tcp_md5sig_key *key;
1064 	unsigned int size = sizeof(struct in_addr);
1065 	const struct tcp_md5sig_info *md5sig;
1066 
1067 	/* caller either holds rcu_read_lock() or socket lock */
1068 	md5sig = rcu_dereference_check(tp->md5sig_info,
1069 				       lockdep_sock_is_held(sk));
1070 	if (!md5sig)
1071 		return NULL;
1072 #if IS_ENABLED(CONFIG_IPV6)
1073 	if (family == AF_INET6)
1074 		size = sizeof(struct in6_addr);
1075 #endif
1076 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1077 				 lockdep_sock_is_held(sk)) {
1078 		if (key->family != family)
1079 			continue;
1080 		if (key->l3index && key->l3index != l3index)
1081 			continue;
1082 		if (!memcmp(&key->addr, addr, size) &&
1083 		    key->prefixlen == prefixlen)
1084 			return key;
1085 	}
1086 	return NULL;
1087 }
1088 
1089 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1090 					 const struct sock *addr_sk)
1091 {
1092 	const union tcp_md5_addr *addr;
1093 	int l3index;
1094 
1095 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1096 						 addr_sk->sk_bound_dev_if);
1097 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1098 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1099 }
1100 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1101 
1102 /* This can be called on a newly created socket, from other files */
1103 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1104 		   int family, u8 prefixlen, int l3index,
1105 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1106 {
1107 	/* Add Key to the list */
1108 	struct tcp_md5sig_key *key;
1109 	struct tcp_sock *tp = tcp_sk(sk);
1110 	struct tcp_md5sig_info *md5sig;
1111 
1112 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1113 	if (key) {
1114 		/* Pre-existing entry - just update that one.
1115 		 * Note that the key might be used concurrently.
1116 		 * data_race() is telling kcsan that we do not care about
1117 		 * key mismatches, since changing the MD5 key on live flows
1118 		 * can lead to packet drops.
1119 		 */
1120 		data_race(memcpy(key->key, newkey, newkeylen));
1121 
1122 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1123 		 * Also note that a reader could catch new key->keylen value
1124 		 * but old key->key[], this is the reason we use __GFP_ZERO
1125 		 * at sock_kmalloc() time below these lines.
1126 		 */
1127 		WRITE_ONCE(key->keylen, newkeylen);
1128 
1129 		return 0;
1130 	}
1131 
1132 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1133 					   lockdep_sock_is_held(sk));
1134 	if (!md5sig) {
1135 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1136 		if (!md5sig)
1137 			return -ENOMEM;
1138 
1139 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1140 		INIT_HLIST_HEAD(&md5sig->head);
1141 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1142 	}
1143 
1144 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1145 	if (!key)
1146 		return -ENOMEM;
1147 	if (!tcp_alloc_md5sig_pool()) {
1148 		sock_kfree_s(sk, key, sizeof(*key));
1149 		return -ENOMEM;
1150 	}
1151 
1152 	memcpy(key->key, newkey, newkeylen);
1153 	key->keylen = newkeylen;
1154 	key->family = family;
1155 	key->prefixlen = prefixlen;
1156 	key->l3index = l3index;
1157 	memcpy(&key->addr, addr,
1158 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1159 				      sizeof(struct in_addr));
1160 	hlist_add_head_rcu(&key->node, &md5sig->head);
1161 	return 0;
1162 }
1163 EXPORT_SYMBOL(tcp_md5_do_add);
1164 
1165 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1166 		   u8 prefixlen, int l3index)
1167 {
1168 	struct tcp_md5sig_key *key;
1169 
1170 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1171 	if (!key)
1172 		return -ENOENT;
1173 	hlist_del_rcu(&key->node);
1174 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1175 	kfree_rcu(key, rcu);
1176 	return 0;
1177 }
1178 EXPORT_SYMBOL(tcp_md5_do_del);
1179 
1180 static void tcp_clear_md5_list(struct sock *sk)
1181 {
1182 	struct tcp_sock *tp = tcp_sk(sk);
1183 	struct tcp_md5sig_key *key;
1184 	struct hlist_node *n;
1185 	struct tcp_md5sig_info *md5sig;
1186 
1187 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1188 
1189 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1190 		hlist_del_rcu(&key->node);
1191 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1192 		kfree_rcu(key, rcu);
1193 	}
1194 }
1195 
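/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler. A minimal userspace sketch
 * for the plain TCP_MD5SIG case (fd, peer_ip and secret are placeholders):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_ip;
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */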
1196 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1197 				 char __user *optval, int optlen)
1198 {
1199 	struct tcp_md5sig cmd;
1200 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1201 	const union tcp_md5_addr *addr;
1202 	u8 prefixlen = 32;
1203 	int l3index = 0;
1204 
1205 	if (optlen < sizeof(cmd))
1206 		return -EINVAL;
1207 
1208 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1209 		return -EFAULT;
1210 
1211 	if (sin->sin_family != AF_INET)
1212 		return -EINVAL;
1213 
1214 	if (optname == TCP_MD5SIG_EXT &&
1215 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1216 		prefixlen = cmd.tcpm_prefixlen;
1217 		if (prefixlen > 32)
1218 			return -EINVAL;
1219 	}
1220 
1221 	if (optname == TCP_MD5SIG_EXT &&
1222 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1223 		struct net_device *dev;
1224 
1225 		rcu_read_lock();
1226 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1227 		if (dev && netif_is_l3_master(dev))
1228 			l3index = dev->ifindex;
1229 
1230 		rcu_read_unlock();
1231 
1232 		/* It is ok to reference whether dev was set or not outside of
1233 		 * the rcu section; right now the device MUST be an L3 master.
1234 		 */
1235 		if (!dev || !l3index)
1236 			return -EINVAL;
1237 	}
1238 
1239 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1240 
1241 	if (!cmd.tcpm_keylen)
1242 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1243 
1244 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1245 		return -EINVAL;
1246 
1247 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1248 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1249 }
1250 
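/* RFC 2385: the MD5 digest covers, in order, a pseudo-header (saddr, daddr,
 * zero pad, IPPROTO_TCP, segment length), the TCP header with its checksum
 * zeroed (options excluded), the TCP payload, and finally the key itself.
 * This helper hashes the first two pieces.
 */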
1251 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1252 				   __be32 daddr, __be32 saddr,
1253 				   const struct tcphdr *th, int nbytes)
1254 {
1255 	struct tcp4_pseudohdr *bp;
1256 	struct scatterlist sg;
1257 	struct tcphdr *_th;
1258 
1259 	bp = hp->scratch;
1260 	bp->saddr = saddr;
1261 	bp->daddr = daddr;
1262 	bp->pad = 0;
1263 	bp->protocol = IPPROTO_TCP;
1264 	bp->len = cpu_to_be16(nbytes);
1265 
1266 	_th = (struct tcphdr *)(bp + 1);
1267 	memcpy(_th, th, sizeof(*th));
1268 	_th->check = 0;
1269 
1270 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1271 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1272 				sizeof(*bp) + sizeof(*th));
1273 	return crypto_ahash_update(hp->md5_req);
1274 }
1275 
1276 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1277 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1278 {
1279 	struct tcp_md5sig_pool *hp;
1280 	struct ahash_request *req;
1281 
1282 	hp = tcp_get_md5sig_pool();
1283 	if (!hp)
1284 		goto clear_hash_noput;
1285 	req = hp->md5_req;
1286 
1287 	if (crypto_ahash_init(req))
1288 		goto clear_hash;
1289 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1290 		goto clear_hash;
1291 	if (tcp_md5_hash_key(hp, key))
1292 		goto clear_hash;
1293 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1294 	if (crypto_ahash_final(req))
1295 		goto clear_hash;
1296 
1297 	tcp_put_md5sig_pool();
1298 	return 0;
1299 
1300 clear_hash:
1301 	tcp_put_md5sig_pool();
1302 clear_hash_noput:
1303 	memset(md5_hash, 0, 16);
1304 	return 1;
1305 }
1306 
1307 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1308 			const struct sock *sk,
1309 			const struct sk_buff *skb)
1310 {
1311 	struct tcp_md5sig_pool *hp;
1312 	struct ahash_request *req;
1313 	const struct tcphdr *th = tcp_hdr(skb);
1314 	__be32 saddr, daddr;
1315 
1316 	if (sk) { /* valid for establish/request sockets */
1317 		saddr = sk->sk_rcv_saddr;
1318 		daddr = sk->sk_daddr;
1319 	} else {
1320 		const struct iphdr *iph = ip_hdr(skb);
1321 		saddr = iph->saddr;
1322 		daddr = iph->daddr;
1323 	}
1324 
1325 	hp = tcp_get_md5sig_pool();
1326 	if (!hp)
1327 		goto clear_hash_noput;
1328 	req = hp->md5_req;
1329 
1330 	if (crypto_ahash_init(req))
1331 		goto clear_hash;
1332 
1333 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1334 		goto clear_hash;
1335 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1336 		goto clear_hash;
1337 	if (tcp_md5_hash_key(hp, key))
1338 		goto clear_hash;
1339 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1340 	if (crypto_ahash_final(req))
1341 		goto clear_hash;
1342 
1343 	tcp_put_md5sig_pool();
1344 	return 0;
1345 
1346 clear_hash:
1347 	tcp_put_md5sig_pool();
1348 clear_hash_noput:
1349 	memset(md5_hash, 0, 16);
1350 	return 1;
1351 }
1352 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1353 
1354 #endif
1355 
1356 /* Called with rcu_read_lock() */
1357 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1358 				    const struct sk_buff *skb,
1359 				    int dif, int sdif)
1360 {
1361 #ifdef CONFIG_TCP_MD5SIG
1362 	/*
1363 	 * This gets called for each TCP segment that arrives
1364 	 * so we want to be efficient.
1365 	 * We have 3 drop cases:
1366 	 * o No MD5 hash and one expected.
1367 	 * o MD5 hash and we're not expecting one.
1368 	 * o MD5 hash and it's wrong.
1369 	 */
1370 	const __u8 *hash_location = NULL;
1371 	struct tcp_md5sig_key *hash_expected;
1372 	const struct iphdr *iph = ip_hdr(skb);
1373 	const struct tcphdr *th = tcp_hdr(skb);
1374 	const union tcp_md5_addr *addr;
1375 	unsigned char newhash[16];
1376 	int genhash, l3index;
1377 
1378 	/* sdif set, means packet ingressed via a device
1379 	 * in an L3 domain and dif is set to the l3mdev
1380 	 */
1381 	l3index = sdif ? dif : 0;
1382 
1383 	addr = (union tcp_md5_addr *)&iph->saddr;
1384 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1385 	hash_location = tcp_parse_md5sig_option(th);
1386 
1387 	/* We've parsed the options - do we have a hash? */
1388 	if (!hash_expected && !hash_location)
1389 		return false;
1390 
1391 	if (hash_expected && !hash_location) {
1392 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1393 		return true;
1394 	}
1395 
1396 	if (!hash_expected && hash_location) {
1397 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1398 		return true;
1399 	}
1400 
1401 	/* Okay, so we have both hash_expected and hash_location -
1402 	 * we need to calculate the checksum.
1403 	 */
1404 	genhash = tcp_v4_md5_hash_skb(newhash,
1405 				      hash_expected,
1406 				      NULL, skb);
1407 
1408 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1409 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1410 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1411 				     &iph->saddr, ntohs(th->source),
1412 				     &iph->daddr, ntohs(th->dest),
1413 				     genhash ? " tcp_v4_calc_md5_hash failed"
1414 				     : "", l3index);
1415 		return true;
1416 	}
1417 	return false;
1418 #endif
1419 	return false;
1420 }
1421 
1422 static void tcp_v4_init_req(struct request_sock *req,
1423 			    const struct sock *sk_listener,
1424 			    struct sk_buff *skb)
1425 {
1426 	struct inet_request_sock *ireq = inet_rsk(req);
1427 	struct net *net = sock_net(sk_listener);
1428 
1429 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1430 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1431 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1432 }
1433 
1434 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1435 					  struct flowi *fl,
1436 					  const struct request_sock *req)
1437 {
1438 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1439 }
1440 
1441 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1442 	.family		=	PF_INET,
1443 	.obj_size	=	sizeof(struct tcp_request_sock),
1444 	.rtx_syn_ack	=	tcp_rtx_synack,
1445 	.send_ack	=	tcp_v4_reqsk_send_ack,
1446 	.destructor	=	tcp_v4_reqsk_destructor,
1447 	.send_reset	=	tcp_v4_send_reset,
1448 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1449 };
1450 
1451 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1452 	.mss_clamp	=	TCP_MSS_DEFAULT,
1453 #ifdef CONFIG_TCP_MD5SIG
1454 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1455 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1456 #endif
1457 	.init_req	=	tcp_v4_init_req,
1458 #ifdef CONFIG_SYN_COOKIES
1459 	.cookie_init_seq =	cookie_v4_init_sequence,
1460 #endif
1461 	.route_req	=	tcp_v4_route_req,
1462 	.init_seq	=	tcp_v4_init_seq,
1463 	.init_ts_off	=	tcp_v4_init_ts_off,
1464 	.send_synack	=	tcp_v4_send_synack,
1465 };
1466 
1467 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1468 {
1469 	/* Never answer SYNs sent to broadcast or multicast */
1470 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1471 		goto drop;
1472 
1473 	return tcp_conn_request(&tcp_request_sock_ops,
1474 				&tcp_request_sock_ipv4_ops, sk, skb);
1475 
1476 drop:
1477 	tcp_listendrop(sk);
1478 	return 0;
1479 }
1480 EXPORT_SYMBOL(tcp_v4_conn_request);
1481 
1482 
1483 /*
1484  * The three way handshake has completed - we got a valid synack -
1485  * now create the new socket.
1486  */
1487 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1488 				  struct request_sock *req,
1489 				  struct dst_entry *dst,
1490 				  struct request_sock *req_unhash,
1491 				  bool *own_req)
1492 {
1493 	struct inet_request_sock *ireq;
1494 	struct inet_sock *newinet;
1495 	struct tcp_sock *newtp;
1496 	struct sock *newsk;
1497 #ifdef CONFIG_TCP_MD5SIG
1498 	const union tcp_md5_addr *addr;
1499 	struct tcp_md5sig_key *key;
1500 	int l3index;
1501 #endif
1502 	struct ip_options_rcu *inet_opt;
1503 
1504 	if (sk_acceptq_is_full(sk))
1505 		goto exit_overflow;
1506 
1507 	newsk = tcp_create_openreq_child(sk, req, skb);
1508 	if (!newsk)
1509 		goto exit_nonewsk;
1510 
1511 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1512 	inet_sk_rx_dst_set(newsk, skb);
1513 
1514 	newtp		      = tcp_sk(newsk);
1515 	newinet		      = inet_sk(newsk);
1516 	ireq		      = inet_rsk(req);
1517 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1518 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1519 	newsk->sk_bound_dev_if = ireq->ir_iif;
1520 	newinet->inet_saddr   = ireq->ir_loc_addr;
1521 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1522 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1523 	newinet->mc_index     = inet_iif(skb);
1524 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1525 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1526 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1527 	if (inet_opt)
1528 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1529 	newinet->inet_id = prandom_u32();
1530 
1531 	if (!dst) {
1532 		dst = inet_csk_route_child_sock(sk, newsk, req);
1533 		if (!dst)
1534 			goto put_and_exit;
1535 	} else {
1536 		/* syncookie case: see end of cookie_v4_check() */
1537 	}
1538 	sk_setup_caps(newsk, dst);
1539 
1540 	tcp_ca_openreq_child(newsk, dst);
1541 
1542 	tcp_sync_mss(newsk, dst_mtu(dst));
1543 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1544 
1545 	tcp_initialize_rcv_mss(newsk);
1546 
1547 #ifdef CONFIG_TCP_MD5SIG
1548 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1549 	/* Copy over the MD5 key from the original socket */
1550 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1551 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1552 	if (key) {
1553 		/*
1554 		 * We're using one, so create a matching key
1555 		 * on the newsk structure. If we fail to get
1556 		 * memory, then we end up not copying the key
1557 		 * across. Shucks.
1558 		 */
1559 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1560 			       key->key, key->keylen, GFP_ATOMIC);
1561 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1562 	}
1563 #endif
1564 
1565 	if (__inet_inherit_port(sk, newsk) < 0)
1566 		goto put_and_exit;
1567 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1568 	if (likely(*own_req)) {
1569 		tcp_move_syn(newtp, req);
1570 		ireq->ireq_opt = NULL;
1571 	} else {
1572 		newinet->inet_opt = NULL;
1573 	}
1574 	return newsk;
1575 
1576 exit_overflow:
1577 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1578 exit_nonewsk:
1579 	dst_release(dst);
1580 exit:
1581 	tcp_listendrop(sk);
1582 	return NULL;
1583 put_and_exit:
1584 	newinet->inet_opt = NULL;
1585 	inet_csk_prepare_forced_close(newsk);
1586 	tcp_done(newsk);
1587 	goto exit;
1588 }
1589 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1590 
1591 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1592 {
1593 #ifdef CONFIG_SYN_COOKIES
1594 	const struct tcphdr *th = tcp_hdr(skb);
1595 
1596 	if (!th->syn)
1597 		sk = cookie_v4_check(sk, skb);
1598 #endif
1599 	return sk;
1600 }
1601 
1602 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1603 			 struct tcphdr *th, u32 *cookie)
1604 {
1605 	u16 mss = 0;
1606 #ifdef CONFIG_SYN_COOKIES
1607 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1608 				    &tcp_request_sock_ipv4_ops, sk, th);
1609 	if (mss) {
1610 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1611 		tcp_synq_overflow(sk);
1612 	}
1613 #endif
1614 	return mss;
1615 }
1616 
1617 /* The socket must have its spinlock held when we get
1618  * here, unless it is a TCP_LISTEN socket.
1619  *
1620  * We have a potential double-lock case here, so even when
1621  * doing backlog processing we use the BH locking scheme.
1622  * This is because we cannot sleep with the original spinlock
1623  * held.
1624  */
1625 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1626 {
1627 	struct sock *rsk;
1628 
1629 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1630 		struct dst_entry *dst = sk->sk_rx_dst;
1631 
1632 		sock_rps_save_rxhash(sk, skb);
1633 		sk_mark_napi_id(sk, skb);
1634 		if (dst) {
1635 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1636 			    !dst->ops->check(dst, 0)) {
1637 				dst_release(dst);
1638 				sk->sk_rx_dst = NULL;
1639 			}
1640 		}
1641 		tcp_rcv_established(sk, skb);
1642 		return 0;
1643 	}
1644 
1645 	if (tcp_checksum_complete(skb))
1646 		goto csum_err;
1647 
1648 	if (sk->sk_state == TCP_LISTEN) {
1649 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1650 
1651 		if (!nsk)
1652 			goto discard;
1653 		if (nsk != sk) {
1654 			if (tcp_child_process(sk, nsk, skb)) {
1655 				rsk = nsk;
1656 				goto reset;
1657 			}
1658 			return 0;
1659 		}
1660 	} else
1661 		sock_rps_save_rxhash(sk, skb);
1662 
1663 	if (tcp_rcv_state_process(sk, skb)) {
1664 		rsk = sk;
1665 		goto reset;
1666 	}
1667 	return 0;
1668 
1669 reset:
1670 	tcp_v4_send_reset(rsk, skb);
1671 discard:
1672 	kfree_skb(skb);
1673 	/* Be careful here. If this function gets more complicated and
1674 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1675 	 * might be destroyed here. This current version compiles correctly,
1676 	 * but you have been warned.
1677 	 */
1678 	return 0;
1679 
1680 csum_err:
1681 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1682 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1683 	goto discard;
1684 }
1685 EXPORT_SYMBOL(tcp_v4_do_rcv);
1686 
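/* Early demux: look up an established socket by the 4-tuple before routing,
 * attach it to the skb and, when the cached rx dst is still valid for this
 * input interface, reuse it so the normal route lookup can be skipped.
 */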
1687 int tcp_v4_early_demux(struct sk_buff *skb)
1688 {
1689 	const struct iphdr *iph;
1690 	const struct tcphdr *th;
1691 	struct sock *sk;
1692 
1693 	if (skb->pkt_type != PACKET_HOST)
1694 		return 0;
1695 
1696 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1697 		return 0;
1698 
1699 	iph = ip_hdr(skb);
1700 	th = tcp_hdr(skb);
1701 
1702 	if (th->doff < sizeof(struct tcphdr) / 4)
1703 		return 0;
1704 
1705 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1706 				       iph->saddr, th->source,
1707 				       iph->daddr, ntohs(th->dest),
1708 				       skb->skb_iif, inet_sdif(skb));
1709 	if (sk) {
1710 		skb->sk = sk;
1711 		skb->destructor = sock_edemux;
1712 		if (sk_fullsock(sk)) {
1713 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1714 
1715 			if (dst)
1716 				dst = dst_check(dst, 0);
1717 			if (dst &&
1718 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1719 				skb_dst_set_noref(skb, dst);
1720 		}
1721 	}
1722 	return 0;
1723 }
1724 
1725 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1726 {
1727 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1728 	struct skb_shared_info *shinfo;
1729 	const struct tcphdr *th;
1730 	struct tcphdr *thtail;
1731 	struct sk_buff *tail;
1732 	unsigned int hdrlen;
1733 	bool fragstolen;
1734 	u32 gso_segs;
1735 	int delta;
1736 
1737 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1738 	 * we can fix skb->truesize to its real value to avoid future drops.
1739 	 * This is valid because skb is not yet charged to the socket.
1740 	 * It has been noticed that pure SACK packets were sometimes dropped
1741 	 * (if cooked by drivers without the copybreak feature).
1742 	 */
1743 	skb_condense(skb);
1744 
1745 	skb_dst_drop(skb);
1746 
1747 	if (unlikely(tcp_checksum_complete(skb))) {
1748 		bh_unlock_sock(sk);
1749 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1750 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1751 		return true;
1752 	}
1753 
1754 	/* Attempt coalescing to last skb in backlog, even if we are
1755 	 * above the limits.
1756 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1757 	 */
1758 	th = (const struct tcphdr *)skb->data;
1759 	hdrlen = th->doff * 4;
1760 	shinfo = skb_shinfo(skb);
1761 
1762 	if (!shinfo->gso_size)
1763 		shinfo->gso_size = skb->len - hdrlen;
1764 
1765 	if (!shinfo->gso_segs)
1766 		shinfo->gso_segs = 1;
1767 
1768 	tail = sk->sk_backlog.tail;
1769 	if (!tail)
1770 		goto no_coalesce;
1771 	thtail = (struct tcphdr *)tail->data;
1772 
1773 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1774 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1775 	    ((TCP_SKB_CB(tail)->tcp_flags |
1776 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1777 	    !((TCP_SKB_CB(tail)->tcp_flags &
1778 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1779 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1780 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1781 #ifdef CONFIG_TLS_DEVICE
1782 	    tail->decrypted != skb->decrypted ||
1783 #endif
1784 	    thtail->doff != th->doff ||
1785 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1786 		goto no_coalesce;
1787 
1788 	__skb_pull(skb, hdrlen);
1789 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1790 		thtail->window = th->window;
1791 
1792 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1793 
1794 		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1795 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1796 
1797 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1798 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1799 		 * is not entered if we append a packet with a FIN.
1800 		 * SYN, RST, URG are not present.
1801 		 * ACK is set on both packets.
1802 		 * PSH : the TCP stack does not really care,
1803 		 *       at least for 'GRO' packets.
1804 		 */
1805 		thtail->fin |= th->fin;
1806 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1807 
1808 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1809 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1810 			tail->tstamp = skb->tstamp;
1811 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1812 		}
1813 
1814 		/* Not as strict as GRO. We only need to carry the max mss value */
1815 		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1816 						 skb_shinfo(tail)->gso_size);
1817 
1818 		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1819 		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1820 
1821 		sk->sk_backlog.len += delta;
1822 		__NET_INC_STATS(sock_net(sk),
1823 				LINUX_MIB_TCPBACKLOGCOALESCE);
1824 		kfree_skb_partial(skb, fragstolen);
1825 		return false;
1826 	}
1827 	__skb_push(skb, hdrlen);
1828 
1829 no_coalesce:
1830 	/* Only the socket owner can try to collapse/prune rx queues
1831 	 * to reduce memory overhead, so add a little headroom here.
1832 	 * Only a few socket backlogs are likely to be non-empty at once.
1833 	 */
1834 	limit += 64*1024;
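	/* Hedged example: for a freshly created socket with default sysctls
	 * (tcp_rmem[1] = 131072, tcp_wmem[1] = 16384), this gives roughly
	 * 131072 + 16384 + 65536 = 212992 bytes of backlog headroom.
	 */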
1835 
1836 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1837 		bh_unlock_sock(sk);
1838 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1839 		return true;
1840 	}
1841 	return false;
1842 }
1843 EXPORT_SYMBOL(tcp_add_backlog);
1844 
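/* Run the socket filter (e.g. BPF) attached to @sk on @skb, never trimming
 * the skb below the TCP header length so later header accesses stay valid.
 */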
1845 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1846 {
1847 	struct tcphdr *th = (struct tcphdr *)skb->data;
1848 
1849 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1850 }
1851 EXPORT_SYMBOL(tcp_filter);
1852 
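/* Undo tcp_v4_fill_cb(): move the saved IP control block back to the front
 * of skb->cb[] before the skb is handed to another socket or re-looked-up.
 */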
1853 static void tcp_v4_restore_cb(struct sk_buff *skb)
1854 {
1855 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1856 		sizeof(struct inet_skb_parm));
1857 }
1858 
1859 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1860 			   const struct tcphdr *th)
1861 {
1862 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1863 	 * barrier() makes sure the compiler won't play aliasing games.
1864 	 */
1865 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1866 		sizeof(struct inet_skb_parm));
1867 	barrier();
1868 
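	/* end_seq covers the payload plus one sequence number each for SYN
	 * and FIN, since both consume sequence space.
	 */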
1869 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1870 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1871 				    skb->len - th->doff * 4);
1872 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1873 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1874 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1875 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1876 	TCP_SKB_CB(skb)->sacked	 = 0;
1877 	TCP_SKB_CB(skb)->has_rxtstamp =
1878 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1879 }
1880 
1881 /*
1882  *	From tcp_input.c
1883  */
1884 
1885 int tcp_v4_rcv(struct sk_buff *skb)
1886 {
1887 	struct net *net = dev_net(skb->dev);
1888 	struct sk_buff *skb_to_free;
1889 	int sdif = inet_sdif(skb);
1890 	int dif = inet_iif(skb);
1891 	const struct iphdr *iph;
1892 	const struct tcphdr *th;
1893 	bool refcounted;
1894 	struct sock *sk;
1895 	int ret;
1896 
1897 	if (skb->pkt_type != PACKET_HOST)
1898 		goto discard_it;
1899 
1900 	/* Count it even if it's bad */
1901 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1902 
1903 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1904 		goto discard_it;
1905 
1906 	th = (const struct tcphdr *)skb->data;
1907 
1908 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1909 		goto bad_packet;
1910 	if (!pskb_may_pull(skb, th->doff * 4))
1911 		goto discard_it;
1912 
1913 	/* An explanation is in order here:
1914 	 * packet length and doff are validated later by header prediction,
1915 	 * provided the th->doff == 0 case has been eliminated above.
1916 	 * So we defer those checks. */
1917 
1918 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1919 		goto csum_error;
1920 
1921 	th = (const struct tcphdr *)skb->data;
1922 	iph = ip_hdr(skb);
1923 lookup:
1924 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1925 			       th->dest, sdif, &refcounted);
1926 	if (!sk)
1927 		goto no_tcp_socket;
1928 
1929 process:
1930 	if (sk->sk_state == TCP_TIME_WAIT)
1931 		goto do_time_wait;
1932 
1933 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1934 		struct request_sock *req = inet_reqsk(sk);
1935 		bool req_stolen = false;
1936 		struct sock *nsk;
1937 
1938 		sk = req->rsk_listener;
1939 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1940 			sk_drops_add(sk, skb);
1941 			reqsk_put(req);
1942 			goto discard_it;
1943 		}
1944 		if (tcp_checksum_complete(skb)) {
1945 			reqsk_put(req);
1946 			goto csum_error;
1947 		}
1948 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
1949 			inet_csk_reqsk_queue_drop_and_put(sk, req);
1950 			goto lookup;
1951 		}
1952 		/* We own a reference on the listener; increase it again,
1953 		 * as we might lose it too soon.
1954 		 */
1955 		sock_hold(sk);
1956 		refcounted = true;
1957 		nsk = NULL;
1958 		if (!tcp_filter(sk, skb)) {
1959 			th = (const struct tcphdr *)skb->data;
1960 			iph = ip_hdr(skb);
1961 			tcp_v4_fill_cb(skb, iph, th);
1962 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1963 		}
1964 		if (!nsk) {
1965 			reqsk_put(req);
1966 			if (req_stolen) {
1967 				/* Another CPU got exclusive access to req
1968 				 * and created a full-blown socket.
1969 				 * Try to feed this packet to this socket
1970 				 * instead of discarding it.
1971 				 */
1972 				tcp_v4_restore_cb(skb);
1973 				sock_put(sk);
1974 				goto lookup;
1975 			}
1976 			goto discard_and_relse;
1977 		}
1978 		if (nsk == sk) {
1979 			reqsk_put(req);
1980 			tcp_v4_restore_cb(skb);
1981 		} else if (tcp_child_process(sk, nsk, skb)) {
1982 			tcp_v4_send_reset(nsk, skb);
1983 			goto discard_and_relse;
1984 		} else {
1985 			sock_put(sk);
1986 			return 0;
1987 		}
1988 	}
1989 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1990 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1991 		goto discard_and_relse;
1992 	}
1993 
1994 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1995 		goto discard_and_relse;
1996 
1997 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
1998 		goto discard_and_relse;
1999 
2000 	nf_reset_ct(skb);
2001 
2002 	if (tcp_filter(sk, skb))
2003 		goto discard_and_relse;
2004 	th = (const struct tcphdr *)skb->data;
2005 	iph = ip_hdr(skb);
2006 	tcp_v4_fill_cb(skb, iph, th);
2007 
2008 	skb->dev = NULL;
2009 
2010 	if (sk->sk_state == TCP_LISTEN) {
2011 		ret = tcp_v4_do_rcv(sk, skb);
2012 		goto put_and_return;
2013 	}
2014 
2015 	sk_incoming_cpu_update(sk);
2016 
2017 	bh_lock_sock_nested(sk);
2018 	tcp_segs_in(tcp_sk(sk), skb);
2019 	ret = 0;
2020 	if (!sock_owned_by_user(sk)) {
2021 		skb_to_free = sk->sk_rx_skb_cache;
2022 		sk->sk_rx_skb_cache = NULL;
2023 		ret = tcp_v4_do_rcv(sk, skb);
2024 	} else {
2025 		if (tcp_add_backlog(sk, skb))
2026 			goto discard_and_relse;
2027 		skb_to_free = NULL;
2028 	}
2029 	bh_unlock_sock(sk);
2030 	if (skb_to_free)
2031 		__kfree_skb(skb_to_free);
2032 
2033 put_and_return:
2034 	if (refcounted)
2035 		sock_put(sk);
2036 
2037 	return ret;
2038 
2039 no_tcp_socket:
2040 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2041 		goto discard_it;
2042 
2043 	tcp_v4_fill_cb(skb, iph, th);
2044 
2045 	if (tcp_checksum_complete(skb)) {
2046 csum_error:
2047 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2048 bad_packet:
2049 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2050 	} else {
2051 		tcp_v4_send_reset(NULL, skb);
2052 	}
2053 
2054 discard_it:
2055 	/* Discard frame. */
2056 	kfree_skb(skb);
2057 	return 0;
2058 
2059 discard_and_relse:
2060 	sk_drops_add(sk, skb);
2061 	if (refcounted)
2062 		sock_put(sk);
2063 	goto discard_it;
2064 
2065 do_time_wait:
2066 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2067 		inet_twsk_put(inet_twsk(sk));
2068 		goto discard_it;
2069 	}
2070 
2071 	tcp_v4_fill_cb(skb, iph, th);
2072 
2073 	if (tcp_checksum_complete(skb)) {
2074 		inet_twsk_put(inet_twsk(sk));
2075 		goto csum_error;
2076 	}
2077 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2078 	case TCP_TW_SYN: {
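		/* A new SYN hit a TIME_WAIT socket and was judged acceptable
		 * for a new connection: if a listening socket matches, retire
		 * the timewait entry and let the listener handle the SYN.
		 */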
2079 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2080 							&tcp_hashinfo, skb,
2081 							__tcp_hdrlen(th),
2082 							iph->saddr, th->source,
2083 							iph->daddr, th->dest,
2084 							inet_iif(skb),
2085 							sdif);
2086 		if (sk2) {
2087 			inet_twsk_deschedule_put(inet_twsk(sk));
2088 			sk = sk2;
2089 			tcp_v4_restore_cb(skb);
2090 			refcounted = false;
2091 			goto process;
2092 		}
2093 	}
2094 		/* to ACK */
2095 		fallthrough;
2096 	case TCP_TW_ACK:
2097 		tcp_v4_timewait_ack(sk, skb);
2098 		break;
2099 	case TCP_TW_RST:
2100 		tcp_v4_send_reset(sk, skb);
2101 		inet_twsk_deschedule_put(inet_twsk(sk));
2102 		goto discard_it;
2103 	case TCP_TW_SUCCESS:;
2104 	}
2105 	goto discard_it;
2106 }
2107 
2108 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2109 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2110 	.twsk_unique	= tcp_twsk_unique,
2111 	.twsk_destructor= tcp_twsk_destructor,
2112 };
2113 
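/* Cache the input route on the socket so tcp_v4_early_demux() can later
 * attach it to incoming packets without a fresh route lookup.
 */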
2114 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2115 {
2116 	struct dst_entry *dst = skb_dst(skb);
2117 
2118 	if (dst && dst_hold_safe(dst)) {
2119 		sk->sk_rx_dst = dst;
2120 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2121 	}
2122 }
2123 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2124 
2125 const struct inet_connection_sock_af_ops ipv4_specific = {
2126 	.queue_xmit	   = ip_queue_xmit,
2127 	.send_check	   = tcp_v4_send_check,
2128 	.rebuild_header	   = inet_sk_rebuild_header,
2129 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2130 	.conn_request	   = tcp_v4_conn_request,
2131 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2132 	.net_header_len	   = sizeof(struct iphdr),
2133 	.setsockopt	   = ip_setsockopt,
2134 	.getsockopt	   = ip_getsockopt,
2135 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2136 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2137 #ifdef CONFIG_COMPAT
2138 	.compat_setsockopt = compat_ip_setsockopt,
2139 	.compat_getsockopt = compat_ip_getsockopt,
2140 #endif
2141 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2142 };
2143 EXPORT_SYMBOL(ipv4_specific);
2144 
2145 #ifdef CONFIG_TCP_MD5SIG
2146 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2147 	.md5_lookup		= tcp_v4_md5_lookup,
2148 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2149 	.md5_parse		= tcp_v4_parse_md5_keys,
2150 };
2151 #endif
2152 
2153 /* NOTE: A lot of fields are already zeroed by sk_alloc(),
2154  *       so they need not be initialized here.
2155  */
2156 static int tcp_v4_init_sock(struct sock *sk)
2157 {
2158 	struct inet_connection_sock *icsk = inet_csk(sk);
2159 
2160 	tcp_init_sock(sk);
2161 
2162 	icsk->icsk_af_ops = &ipv4_specific;
2163 
2164 #ifdef CONFIG_TCP_MD5SIG
2165 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2166 #endif
2167 
2168 	return 0;
2169 }
2170 
2171 void tcp_v4_destroy_sock(struct sock *sk)
2172 {
2173 	struct tcp_sock *tp = tcp_sk(sk);
2174 
2175 	trace_tcp_destroy_sock(sk);
2176 
2177 	tcp_clear_xmit_timers(sk);
2178 
2179 	tcp_cleanup_congestion_control(sk);
2180 
2181 	tcp_cleanup_ulp(sk);
2182 
2183 	/* Clean up the write buffer. */
2184 	tcp_write_queue_purge(sk);
2185 
2186 	/* Check if we want to disable active TFO */
2187 	tcp_fastopen_active_disable_ofo_check(sk);
2188 
2189 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2190 	skb_rbtree_purge(&tp->out_of_order_queue);
2191 
2192 #ifdef CONFIG_TCP_MD5SIG
2193 	/* Clean up the MD5 key list, if any */
2194 	if (tp->md5sig_info) {
2195 		tcp_clear_md5_list(sk);
2196 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2197 		tp->md5sig_info = NULL;
2198 	}
2199 #endif
2200 
2201 	/* Clean up a referenced TCP bind bucket. */
2202 	if (inet_csk(sk)->icsk_bind_hash)
2203 		inet_put_port(sk);
2204 
2205 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2206 
2207 	/* If socket is aborted during connect operation */
2208 	tcp_free_fastopen_req(tp);
2209 	tcp_fastopen_destroy_cipher(sk);
2210 	tcp_saved_syn_free(tp);
2211 
2212 	sk_sockets_allocated_dec(sk);
2213 }
2214 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2215 
2216 #ifdef CONFIG_PROC_FS
2217 /* Proc filesystem TCP sock list dumping. */
2218 
2219 /*
2220  * Get the next listening socket after cur.  If cur is NULL, get the first
2221  * socket starting from the bucket given in st->bucket; when st->bucket is
2222  * zero, the very first socket in the hash table is returned.
2223  */
2224 static void *listening_get_next(struct seq_file *seq, void *cur)
2225 {
2226 	struct tcp_seq_afinfo *afinfo;
2227 	struct tcp_iter_state *st = seq->private;
2228 	struct net *net = seq_file_net(seq);
2229 	struct inet_listen_hashbucket *ilb;
2230 	struct hlist_nulls_node *node;
2231 	struct sock *sk = cur;
2232 
2233 	if (st->bpf_seq_afinfo)
2234 		afinfo = st->bpf_seq_afinfo;
2235 	else
2236 		afinfo = PDE_DATA(file_inode(seq->file));
2237 
2238 	if (!sk) {
2239 get_head:
2240 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2241 		spin_lock(&ilb->lock);
2242 		sk = sk_nulls_head(&ilb->nulls_head);
2243 		st->offset = 0;
2244 		goto get_sk;
2245 	}
2246 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2247 	++st->num;
2248 	++st->offset;
2249 
2250 	sk = sk_nulls_next(sk);
2251 get_sk:
2252 	sk_nulls_for_each_from(sk, node) {
2253 		if (!net_eq(sock_net(sk), net))
2254 			continue;
2255 		if (afinfo->family == AF_UNSPEC ||
2256 		    sk->sk_family == afinfo->family)
2257 			return sk;
2258 	}
2259 	spin_unlock(&ilb->lock);
2260 	st->offset = 0;
2261 	if (++st->bucket < INET_LHTABLE_SIZE)
2262 		goto get_head;
2263 	return NULL;
2264 }
2265 
2266 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2267 {
2268 	struct tcp_iter_state *st = seq->private;
2269 	void *rc;
2270 
2271 	st->bucket = 0;
2272 	st->offset = 0;
2273 	rc = listening_get_next(seq, NULL);
2274 
2275 	while (rc && *pos) {
2276 		rc = listening_get_next(seq, rc);
2277 		--*pos;
2278 	}
2279 	return rc;
2280 }
2281 
2282 static inline bool empty_bucket(const struct tcp_iter_state *st)
2283 {
2284 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2285 }
2286 
2287 /*
2288  * Get the first established socket, starting from the bucket in st->bucket.
2289  * If st->bucket is zero, the very first socket in the hash table is returned.
2290  */
2291 static void *established_get_first(struct seq_file *seq)
2292 {
2293 	struct tcp_seq_afinfo *afinfo;
2294 	struct tcp_iter_state *st = seq->private;
2295 	struct net *net = seq_file_net(seq);
2296 	void *rc = NULL;
2297 
2298 	if (st->bpf_seq_afinfo)
2299 		afinfo = st->bpf_seq_afinfo;
2300 	else
2301 		afinfo = PDE_DATA(file_inode(seq->file));
2302 
2303 	st->offset = 0;
2304 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2305 		struct sock *sk;
2306 		struct hlist_nulls_node *node;
2307 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2308 
2309 		/* Lockless fast path for the common case of empty buckets */
2310 		if (empty_bucket(st))
2311 			continue;
2312 
2313 		spin_lock_bh(lock);
2314 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2315 			if ((afinfo->family != AF_UNSPEC &&
2316 			     sk->sk_family != afinfo->family) ||
2317 			    !net_eq(sock_net(sk), net)) {
2318 				continue;
2319 			}
2320 			rc = sk;
2321 			goto out;
2322 		}
2323 		spin_unlock_bh(lock);
2324 	}
2325 out:
2326 	return rc;
2327 }
2328 
2329 static void *established_get_next(struct seq_file *seq, void *cur)
2330 {
2331 	struct tcp_seq_afinfo *afinfo;
2332 	struct sock *sk = cur;
2333 	struct hlist_nulls_node *node;
2334 	struct tcp_iter_state *st = seq->private;
2335 	struct net *net = seq_file_net(seq);
2336 
2337 	if (st->bpf_seq_afinfo)
2338 		afinfo = st->bpf_seq_afinfo;
2339 	else
2340 		afinfo = PDE_DATA(file_inode(seq->file));
2341 
2342 	++st->num;
2343 	++st->offset;
2344 
2345 	sk = sk_nulls_next(sk);
2346 
2347 	sk_nulls_for_each_from(sk, node) {
2348 		if ((afinfo->family == AF_UNSPEC ||
2349 		     sk->sk_family == afinfo->family) &&
2350 		    net_eq(sock_net(sk), net))
2351 			return sk;
2352 	}
2353 
2354 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2355 	++st->bucket;
2356 	return established_get_first(seq);
2357 }
2358 
2359 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2360 {
2361 	struct tcp_iter_state *st = seq->private;
2362 	void *rc;
2363 
2364 	st->bucket = 0;
2365 	rc = established_get_first(seq);
2366 
2367 	while (rc && pos) {
2368 		rc = established_get_next(seq, rc);
2369 		--pos;
2370 	}
2371 	return rc;
2372 }
2373 
2374 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2375 {
2376 	void *rc;
2377 	struct tcp_iter_state *st = seq->private;
2378 
2379 	st->state = TCP_SEQ_STATE_LISTENING;
2380 	rc	  = listening_get_idx(seq, &pos);
2381 
2382 	if (!rc) {
2383 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2384 		rc	  = established_get_idx(seq, pos);
2385 	}
2386 
2387 	return rc;
2388 }
2389 
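/* Resume iteration at the bucket/offset remembered from the previous read,
 * so large /proc/net/tcp dumps do not rescan the hash tables from scratch.
 */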
2390 static void *tcp_seek_last_pos(struct seq_file *seq)
2391 {
2392 	struct tcp_iter_state *st = seq->private;
2393 	int offset = st->offset;
2394 	int orig_num = st->num;
2395 	void *rc = NULL;
2396 
2397 	switch (st->state) {
2398 	case TCP_SEQ_STATE_LISTENING:
2399 		if (st->bucket >= INET_LHTABLE_SIZE)
2400 			break;
2401 		st->state = TCP_SEQ_STATE_LISTENING;
2402 		rc = listening_get_next(seq, NULL);
2403 		while (offset-- && rc)
2404 			rc = listening_get_next(seq, rc);
2405 		if (rc)
2406 			break;
2407 		st->bucket = 0;
2408 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2409 		fallthrough;
2410 	case TCP_SEQ_STATE_ESTABLISHED:
2411 		if (st->bucket > tcp_hashinfo.ehash_mask)
2412 			break;
2413 		rc = established_get_first(seq);
2414 		while (offset-- && rc)
2415 			rc = established_get_next(seq, rc);
2416 	}
2417 
2418 	st->num = orig_num;
2419 
2420 	return rc;
2421 }
2422 
2423 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2424 {
2425 	struct tcp_iter_state *st = seq->private;
2426 	void *rc;
2427 
2428 	if (*pos && *pos == st->last_pos) {
2429 		rc = tcp_seek_last_pos(seq);
2430 		if (rc)
2431 			goto out;
2432 	}
2433 
2434 	st->state = TCP_SEQ_STATE_LISTENING;
2435 	st->num = 0;
2436 	st->bucket = 0;
2437 	st->offset = 0;
2438 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2439 
2440 out:
2441 	st->last_pos = *pos;
2442 	return rc;
2443 }
2444 EXPORT_SYMBOL(tcp_seq_start);
2445 
2446 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2447 {
2448 	struct tcp_iter_state *st = seq->private;
2449 	void *rc = NULL;
2450 
2451 	if (v == SEQ_START_TOKEN) {
2452 		rc = tcp_get_idx(seq, 0);
2453 		goto out;
2454 	}
2455 
2456 	switch (st->state) {
2457 	case TCP_SEQ_STATE_LISTENING:
2458 		rc = listening_get_next(seq, v);
2459 		if (!rc) {
2460 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2461 			st->bucket = 0;
2462 			st->offset = 0;
2463 			rc	  = established_get_first(seq);
2464 		}
2465 		break;
2466 	case TCP_SEQ_STATE_ESTABLISHED:
2467 		rc = established_get_next(seq, v);
2468 		break;
2469 	}
2470 out:
2471 	++*pos;
2472 	st->last_pos = *pos;
2473 	return rc;
2474 }
2475 EXPORT_SYMBOL(tcp_seq_next);
2476 
2477 void tcp_seq_stop(struct seq_file *seq, void *v)
2478 {
2479 	struct tcp_iter_state *st = seq->private;
2480 
2481 	switch (st->state) {
2482 	case TCP_SEQ_STATE_LISTENING:
2483 		if (v != SEQ_START_TOKEN)
2484 			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2485 		break;
2486 	case TCP_SEQ_STATE_ESTABLISHED:
2487 		if (v)
2488 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2489 		break;
2490 	}
2491 }
2492 EXPORT_SYMBOL(tcp_seq_stop);
2493 
2494 static void get_openreq4(const struct request_sock *req,
2495 			 struct seq_file *f, int i)
2496 {
2497 	const struct inet_request_sock *ireq = inet_rsk(req);
2498 	long delta = req->rsk_timer.expires - jiffies;
2499 
2500 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2501 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2502 		i,
2503 		ireq->ir_loc_addr,
2504 		ireq->ir_num,
2505 		ireq->ir_rmt_addr,
2506 		ntohs(ireq->ir_rmt_port),
2507 		TCP_SYN_RECV,
2508 		0, 0, /* could print option size, but that is af dependent. */
2509 		1,    /* timers active (only the expire timer) */
2510 		jiffies_delta_to_clock_t(delta),
2511 		req->num_timeout,
2512 		from_kuid_munged(seq_user_ns(f),
2513 				 sock_i_uid(req->rsk_listener)),
2514 		0,  /* non standard timer */
2515 		0, /* open_requests have no inode */
2516 		0,
2517 		req);
2518 }
2519 
2520 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2521 {
2522 	int timer_active;
2523 	unsigned long timer_expires;
2524 	const struct tcp_sock *tp = tcp_sk(sk);
2525 	const struct inet_connection_sock *icsk = inet_csk(sk);
2526 	const struct inet_sock *inet = inet_sk(sk);
2527 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2528 	__be32 dest = inet->inet_daddr;
2529 	__be32 src = inet->inet_rcv_saddr;
2530 	__u16 destp = ntohs(inet->inet_dport);
2531 	__u16 srcp = ntohs(inet->inet_sport);
2532 	int rx_queue;
2533 	int state;
2534 
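	/* timer_active codes shown in /proc/net/tcp: 0 none, 1 retransmit/
	 * RACK/loss-probe timer, 2 keepalive, 3 TIME_WAIT (see
	 * get_timewait4_sock()), 4 zero-window probe.
	 */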
2535 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2536 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2537 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2538 		timer_active	= 1;
2539 		timer_expires	= icsk->icsk_timeout;
2540 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2541 		timer_active	= 4;
2542 		timer_expires	= icsk->icsk_timeout;
2543 	} else if (timer_pending(&sk->sk_timer)) {
2544 		timer_active	= 2;
2545 		timer_expires	= sk->sk_timer.expires;
2546 	} else {
2547 		timer_active	= 0;
2548 		timer_expires = jiffies;
2549 	}
2550 
2551 	state = inet_sk_state_load(sk);
2552 	if (state == TCP_LISTEN)
2553 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2554 	else
2555 		/* Because we don't lock the socket,
2556 		 * we might find a transient negative value.
2557 		 */
2558 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2559 				      READ_ONCE(tp->copied_seq), 0);
2560 
2561 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2562 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2563 		i, src, srcp, dest, destp, state,
2564 		READ_ONCE(tp->write_seq) - tp->snd_una,
2565 		rx_queue,
2566 		timer_active,
2567 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2568 		icsk->icsk_retransmits,
2569 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2570 		icsk->icsk_probes_out,
2571 		sock_i_ino(sk),
2572 		refcount_read(&sk->sk_refcnt), sk,
2573 		jiffies_to_clock_t(icsk->icsk_rto),
2574 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2575 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2576 		tp->snd_cwnd,
2577 		state == TCP_LISTEN ?
2578 		    fastopenq->max_qlen :
2579 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2580 }
2581 
2582 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2583 			       struct seq_file *f, int i)
2584 {
2585 	long delta = tw->tw_timer.expires - jiffies;
2586 	__be32 dest, src;
2587 	__u16 destp, srcp;
2588 
2589 	dest  = tw->tw_daddr;
2590 	src   = tw->tw_rcv_saddr;
2591 	destp = ntohs(tw->tw_dport);
2592 	srcp  = ntohs(tw->tw_sport);
2593 
2594 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2595 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2596 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2597 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2598 		refcount_read(&tw->tw_refcnt), tw);
2599 }
2600 
2601 #define TMPSZ 150
2602 
2603 static int tcp4_seq_show(struct seq_file *seq, void *v)
2604 {
2605 	struct tcp_iter_state *st;
2606 	struct sock *sk = v;
2607 
2608 	seq_setwidth(seq, TMPSZ - 1);
2609 	if (v == SEQ_START_TOKEN) {
2610 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2611 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2612 			   "inode");
2613 		goto out;
2614 	}
2615 	st = seq->private;
2616 
2617 	if (sk->sk_state == TCP_TIME_WAIT)
2618 		get_timewait4_sock(v, seq, st->num);
2619 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2620 		get_openreq4(v, seq, st->num);
2621 	else
2622 		get_tcp4_sock(v, seq, st->num);
2623 out:
2624 	seq_pad(seq, '\n');
2625 	return 0;
2626 }
2627 
2628 #ifdef CONFIG_BPF_SYSCALL
2629 struct bpf_iter__tcp {
2630 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2631 	__bpf_md_ptr(struct sock_common *, sk_common);
2632 	uid_t uid __aligned(8);
2633 };
2634 
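/* Hedged sketch of the BPF side (it lives in a separate BPF object, not in
 * this file): a program attached to the "tcp" iterator target receives
 * struct bpf_iter__tcp as its context, matching the definition above.
 * SEC() and BPF_SEQ_PRINTF() come from the libbpf headers.
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%u uid=%u\n",
 *			       skc->skc_family, ctx->uid);
 *		return 0;
 *	}
 */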
2635 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2636 			     struct sock_common *sk_common, uid_t uid)
2637 {
2638 	struct bpf_iter__tcp ctx;
2639 
2640 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2641 	ctx.meta = meta;
2642 	ctx.sk_common = sk_common;
2643 	ctx.uid = uid;
2644 	return bpf_iter_run_prog(prog, &ctx);
2645 }
2646 
2647 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2648 {
2649 	struct bpf_iter_meta meta;
2650 	struct bpf_prog *prog;
2651 	struct sock *sk = v;
2652 	uid_t uid;
2653 
2654 	if (v == SEQ_START_TOKEN)
2655 		return 0;
2656 
2657 	if (sk->sk_state == TCP_TIME_WAIT) {
2658 		uid = 0;
2659 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2660 		const struct request_sock *req = v;
2661 
2662 		uid = from_kuid_munged(seq_user_ns(seq),
2663 				       sock_i_uid(req->rsk_listener));
2664 	} else {
2665 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2666 	}
2667 
2668 	meta.seq = seq;
2669 	prog = bpf_iter_get_info(&meta, false);
2670 	return tcp_prog_seq_show(prog, &meta, v, uid);
2671 }
2672 
2673 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2674 {
2675 	struct bpf_iter_meta meta;
2676 	struct bpf_prog *prog;
2677 
2678 	if (!v) {
2679 		meta.seq = seq;
2680 		prog = bpf_iter_get_info(&meta, true);
2681 		if (prog)
2682 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2683 	}
2684 
2685 	tcp_seq_stop(seq, v);
2686 }
2687 
2688 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2689 	.show		= bpf_iter_tcp_seq_show,
2690 	.start		= tcp_seq_start,
2691 	.next		= tcp_seq_next,
2692 	.stop		= bpf_iter_tcp_seq_stop,
2693 };
2694 #endif
2695 
2696 static const struct seq_operations tcp4_seq_ops = {
2697 	.show		= tcp4_seq_show,
2698 	.start		= tcp_seq_start,
2699 	.next		= tcp_seq_next,
2700 	.stop		= tcp_seq_stop,
2701 };
2702 
2703 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2704 	.family		= AF_INET,
2705 };
2706 
2707 static int __net_init tcp4_proc_init_net(struct net *net)
2708 {
2709 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2710 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2711 		return -ENOMEM;
2712 	return 0;
2713 }
2714 
2715 static void __net_exit tcp4_proc_exit_net(struct net *net)
2716 {
2717 	remove_proc_entry("tcp", net->proc_net);
2718 }
2719 
2720 static struct pernet_operations tcp4_net_ops = {
2721 	.init = tcp4_proc_init_net,
2722 	.exit = tcp4_proc_exit_net,
2723 };
2724 
2725 int __init tcp4_proc_init(void)
2726 {
2727 	return register_pernet_subsys(&tcp4_net_ops);
2728 }
2729 
2730 void tcp4_proc_exit(void)
2731 {
2732 	unregister_pernet_subsys(&tcp4_net_ops);
2733 }
2734 #endif /* CONFIG_PROC_FS */
2735 
2736 struct proto tcp_prot = {
2737 	.name			= "TCP",
2738 	.owner			= THIS_MODULE,
2739 	.close			= tcp_close,
2740 	.pre_connect		= tcp_v4_pre_connect,
2741 	.connect		= tcp_v4_connect,
2742 	.disconnect		= tcp_disconnect,
2743 	.accept			= inet_csk_accept,
2744 	.ioctl			= tcp_ioctl,
2745 	.init			= tcp_v4_init_sock,
2746 	.destroy		= tcp_v4_destroy_sock,
2747 	.shutdown		= tcp_shutdown,
2748 	.setsockopt		= tcp_setsockopt,
2749 	.getsockopt		= tcp_getsockopt,
2750 	.keepalive		= tcp_set_keepalive,
2751 	.recvmsg		= tcp_recvmsg,
2752 	.sendmsg		= tcp_sendmsg,
2753 	.sendpage		= tcp_sendpage,
2754 	.backlog_rcv		= tcp_v4_do_rcv,
2755 	.release_cb		= tcp_release_cb,
2756 	.hash			= inet_hash,
2757 	.unhash			= inet_unhash,
2758 	.get_port		= inet_csk_get_port,
2759 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2760 	.leave_memory_pressure	= tcp_leave_memory_pressure,
2761 	.stream_memory_free	= tcp_stream_memory_free,
2762 	.sockets_allocated	= &tcp_sockets_allocated,
2763 	.orphan_count		= &tcp_orphan_count,
2764 	.memory_allocated	= &tcp_memory_allocated,
2765 	.memory_pressure	= &tcp_memory_pressure,
2766 	.sysctl_mem		= sysctl_tcp_mem,
2767 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
2768 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
2769 	.max_header		= MAX_TCP_HEADER,
2770 	.obj_size		= sizeof(struct tcp_sock),
2771 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
2772 	.twsk_prot		= &tcp_timewait_sock_ops,
2773 	.rsk_prot		= &tcp_request_sock_ops,
2774 	.h.hashinfo		= &tcp_hashinfo,
2775 	.no_autobind		= true,
2776 #ifdef CONFIG_COMPAT
2777 	.compat_setsockopt	= compat_tcp_setsockopt,
2778 	.compat_getsockopt	= compat_tcp_getsockopt,
2779 #endif
2780 	.diag_destroy		= tcp_abort,
2781 };
2782 EXPORT_SYMBOL(tcp_prot);
2783 
2784 static void __net_exit tcp_sk_exit(struct net *net)
2785 {
2786 	int cpu;
2787 
2788 	if (net->ipv4.tcp_congestion_control)
2789 		bpf_module_put(net->ipv4.tcp_congestion_control,
2790 			       net->ipv4.tcp_congestion_control->owner);
2791 
2792 	for_each_possible_cpu(cpu)
2793 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2794 	free_percpu(net->ipv4.tcp_sk);
2795 }
2796 
2797 static int __net_init tcp_sk_init(struct net *net)
2798 {
2799 	int res, cpu, cnt;
2800 
2801 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2802 	if (!net->ipv4.tcp_sk)
2803 		return -ENOMEM;
2804 
2805 	for_each_possible_cpu(cpu) {
2806 		struct sock *sk;
2807 
2808 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2809 					   IPPROTO_TCP, net);
2810 		if (res)
2811 			goto fail;
2812 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2813 
2814 		/* Enforce IP_DF and IPID == 0 for RSTs and ACKs
2815 		 * sent in SYN-RECV and TIME-WAIT state.
2816 		 */
2817 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2818 
2819 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2820 	}
2821 
2822 	net->ipv4.sysctl_tcp_ecn = 2;
2823 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
2824 
2825 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2826 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2827 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2828 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2829 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2830 
2831 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2832 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2833 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2834 
2835 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2836 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2837 	net->ipv4.sysctl_tcp_syncookies = 1;
2838 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2839 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2840 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2841 	net->ipv4.sysctl_tcp_orphan_retries = 0;
2842 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2843 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2844 	net->ipv4.sysctl_tcp_tw_reuse = 2;
2845 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2846 
2847 	cnt = tcp_hashinfo.ehash_mask + 1;
2848 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2849 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2850 
2851 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2852 	net->ipv4.sysctl_tcp_sack = 1;
2853 	net->ipv4.sysctl_tcp_window_scaling = 1;
2854 	net->ipv4.sysctl_tcp_timestamps = 1;
2855 	net->ipv4.sysctl_tcp_early_retrans = 3;
2856 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2857 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
2858 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
2859 	net->ipv4.sysctl_tcp_max_reordering = 300;
2860 	net->ipv4.sysctl_tcp_dsack = 1;
2861 	net->ipv4.sysctl_tcp_app_win = 31;
2862 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
2863 	net->ipv4.sysctl_tcp_frto = 2;
2864 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2865 	/* This limits the percentage of the congestion window which we
2866 	 * will allow a single TSO frame to consume.  Building TSO frames
2867 	 * which are too large can cause TCP streams to be bursty.
2868 	 */
2869 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2870 	/* Default TSQ limit of 16 TSO segments */
2871 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
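	/* (16 * 65536 = 1048576 bytes that may sit in qdisc/device queues
	 * per socket before TSQ throttles the sender.)
	 */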
2872 	/* rfc5961 challenge ack rate limiting */
2873 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2874 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
2875 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2876 	net->ipv4.sysctl_tcp_autocorking = 1;
2877 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2878 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2879 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2880 	if (net != &init_net) {
2881 		memcpy(net->ipv4.sysctl_tcp_rmem,
2882 		       init_net.ipv4.sysctl_tcp_rmem,
2883 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
2884 		memcpy(net->ipv4.sysctl_tcp_wmem,
2885 		       init_net.ipv4.sysctl_tcp_wmem,
2886 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
2887 	}
2888 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2889 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2890 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2891 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2892 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2893 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2894 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2895 
2896 	/* Reno is always built in */
2897 	if (!net_eq(net, &init_net) &&
2898 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2899 			       init_net.ipv4.tcp_congestion_control->owner))
2900 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2901 	else
2902 		net->ipv4.tcp_congestion_control = &tcp_reno;
2903 
2904 	return 0;
2905 fail:
2906 	tcp_sk_exit(net);
2907 
2908 	return res;
2909 }
2910 
2911 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2912 {
2913 	struct net *net;
2914 
2915 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
2916 
2917 	list_for_each_entry(net, net_exit_list, exit_list)
2918 		tcp_fastopen_ctx_destroy(net);
2919 }
2920 
2921 static struct pernet_operations __net_initdata tcp_sk_ops = {
2922        .init	   = tcp_sk_init,
2923        .exit	   = tcp_sk_exit,
2924        .exit_batch = tcp_sk_exit_batch,
2925 };
2926 
2927 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2928 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
2929 		     struct sock_common *sk_common, uid_t uid)
2930 
2931 static int bpf_iter_init_tcp(void *priv_data)
2932 {
2933 	struct tcp_iter_state *st = priv_data;
2934 	struct tcp_seq_afinfo *afinfo;
2935 	int ret;
2936 
2937 	afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
2938 	if (!afinfo)
2939 		return -ENOMEM;
2940 
2941 	afinfo->family = AF_UNSPEC;
2942 	st->bpf_seq_afinfo = afinfo;
2943 	ret = bpf_iter_init_seq_net(priv_data);
2944 	if (ret)
2945 		kfree(afinfo);
2946 	return ret;
2947 }
2948 
2949 static void bpf_iter_fini_tcp(void *priv_data)
2950 {
2951 	struct tcp_iter_state *st = priv_data;
2952 
2953 	kfree(st->bpf_seq_afinfo);
2954 	bpf_iter_fini_seq_net(priv_data);
2955 }
2956 
2957 static const struct bpf_iter_reg tcp_reg_info = {
2958 	.target			= "tcp",
2959 	.seq_ops		= &bpf_iter_tcp_seq_ops,
2960 	.init_seq_private	= bpf_iter_init_tcp,
2961 	.fini_seq_private	= bpf_iter_fini_tcp,
2962 	.seq_priv_size		= sizeof(struct tcp_iter_state),
2963 	.ctx_arg_info_size	= 1,
2964 	.ctx_arg_info		= {
2965 		{ offsetof(struct bpf_iter__tcp, sk_common),
2966 		  PTR_TO_BTF_ID_OR_NULL },
2967 	},
2968 };
2969 
2970 static void __init bpf_iter_register(void)
2971 {
2972 	if (bpf_iter_reg_target(&tcp_reg_info))
2973 		pr_warn("Warning: could not register bpf iterator tcp\n");
2974 }
2975 
2976 #endif
2977 
2978 void __init tcp_v4_init(void)
2979 {
2980 	if (register_pernet_subsys(&tcp_sk_ops))
2981 		panic("Failed to create the TCP control socket.\n");
2982 
2983 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
2984 	bpf_iter_register();
2985 #endif
2986 }
2987