xref: /linux/net/ipv4/tcp_ipv4.c (revision 08f3e0873ac203449465c2b8473d684e2f9f41d1)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72 
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80 
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83 
84 #include <trace/events/tcp.h>
85 
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90 
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93 
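/* The two helpers below derive the initial sequence number and the
 * per-connection timestamp offset from the packet's address/port 4-tuple
 * via keyed hashes (see secure_seq.c), in the spirit of RFC 6528, so that
 * ISNs are hard to predict off-path.
 */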
94 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95 {
96 	return secure_tcp_seq(ip_hdr(skb)->daddr,
97 			      ip_hdr(skb)->saddr,
98 			      tcp_hdr(skb)->dest,
99 			      tcp_hdr(skb)->source);
100 }
101 
102 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103 {
104 	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105 }
106 
107 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108 {
109 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 	struct tcp_sock *tp = tcp_sk(sk);
112 	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113 
114 	if (reuse == 2) {
115 		/* Still does not detect *everything* that goes through
116 		 * lo, since we require a loopback src or dst address
117 		 * or direct binding to the 'lo' interface.
118 		 */
119 		bool loopback = false;
120 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 			loopback = true;
122 #if IS_ENABLED(CONFIG_IPV6)
123 		if (tw->tw_family == AF_INET6) {
124 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 				loopback = true;
129 		} else
130 #endif
131 		{
132 			if (ipv4_is_loopback(tw->tw_daddr) ||
133 			    ipv4_is_loopback(tw->tw_rcv_saddr))
134 				loopback = true;
135 		}
136 		if (!loopback)
137 			reuse = 0;
138 	}
139 
140 	/* With PAWS, it is safe from the viewpoint
141 	   of data integrity. Even without PAWS it is safe provided sequence
142 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
143 
144 	   Actually, the idea is close to VJ's, except that the timestamp
145 	   cache is held not per host but per port pair, and the TW bucket
146 	   is used as the state holder.
147 
148 	   If the TW bucket has already been destroyed, we fall back to VJ's
149 	   scheme and use the initial timestamp retrieved from the peer table.
150 	 */
151 	if (tcptw->tw_ts_recent_stamp &&
152 	    (!twp || (reuse && time_after32(ktime_get_seconds(),
153 					    tcptw->tw_ts_recent_stamp)))) {
154 		/* In case of repair and re-using TIME-WAIT sockets we still
155 		 * want to be sure that it is safe as above but honor the
156 		 * sequence numbers and time stamps set as part of the repair
157 		 * process.
158 		 *
159 		 * Without this check re-using a TIME-WAIT socket with TCP
160 		 * repair would accumulate a -1 on the repair assigned
161 		 * sequence number. The first time it is reused the sequence
162 		 * is -1, the second time -2, etc. This fixes that issue
163 		 * without appearing to create any others.
164 		 */
165 		if (likely(!tp->repair)) {
166 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167 
168 			if (!seq)
169 				seq = 1;
170 			WRITE_ONCE(tp->write_seq, seq);
171 			tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
172 			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 		}
174 		sock_hold(sktw);
175 		return 1;
176 	}
177 
178 	return 0;
179 }
180 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
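/* Summary of the sysctl_tcp_tw_reuse values handled above:
 *   0 - never reuse a TIME-WAIT socket for a new outgoing connection;
 *   1 - reuse when the timestamp checks above make it safe;
 *   2 - as 1, but only for connections identified as loopback.
 * For example, "sysctl -w net.ipv4.tcp_tw_reuse=2" restricts reuse to
 * loopback traffic, where it is trivially safe.
 */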
181 
182 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 			      int addr_len)
184 {
185 	/* This check is replicated from tcp_v4_connect() and intended to
186 	 * prevent the BPF program called below from accessing bytes that are
187 	 * out of the bounds specified by the user in addr_len.
188 	 */
189 	if (addr_len < sizeof(struct sockaddr_in))
190 		return -EINVAL;
191 
192 	sock_owned_by_me(sk);
193 
194 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195 }
196 
197 /* This will initiate an outgoing connection. */
198 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199 {
200 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 	struct inet_sock *inet = inet_sk(sk);
202 	struct tcp_sock *tp = tcp_sk(sk);
203 	__be16 orig_sport, orig_dport;
204 	__be32 daddr, nexthop;
205 	struct flowi4 *fl4;
206 	struct rtable *rt;
207 	int err;
208 	struct ip_options_rcu *inet_opt;
209 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210 
211 	if (addr_len < sizeof(struct sockaddr_in))
212 		return -EINVAL;
213 
214 	if (usin->sin_family != AF_INET)
215 		return -EAFNOSUPPORT;
216 
217 	nexthop = daddr = usin->sin_addr.s_addr;
218 	inet_opt = rcu_dereference_protected(inet->inet_opt,
219 					     lockdep_sock_is_held(sk));
220 	if (inet_opt && inet_opt->opt.srr) {
221 		if (!daddr)
222 			return -EINVAL;
223 		nexthop = inet_opt->opt.faddr;
224 	}
225 
226 	orig_sport = inet->inet_sport;
227 	orig_dport = usin->sin_port;
228 	fl4 = &inet->cork.fl.u.ip4;
229 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 			      IPPROTO_TCP,
232 			      orig_sport, orig_dport, sk);
233 	if (IS_ERR(rt)) {
234 		err = PTR_ERR(rt);
235 		if (err == -ENETUNREACH)
236 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 		return err;
238 	}
239 
240 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 		ip_rt_put(rt);
242 		return -ENETUNREACH;
243 	}
244 
245 	if (!inet_opt || !inet_opt->opt.srr)
246 		daddr = fl4->daddr;
247 
248 	if (!inet->inet_saddr)
249 		inet->inet_saddr = fl4->saddr;
250 	sk_rcv_saddr_set(sk, inet->inet_saddr);
251 
252 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 		/* Reset inherited state */
254 		tp->rx_opt.ts_recent	   = 0;
255 		tp->rx_opt.ts_recent_stamp = 0;
256 		if (likely(!tp->repair))
257 			WRITE_ONCE(tp->write_seq, 0);
258 	}
259 
260 	inet->inet_dport = usin->sin_port;
261 	sk_daddr_set(sk, daddr);
262 
263 	inet_csk(sk)->icsk_ext_hdr_len = 0;
264 	if (inet_opt)
265 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266 
267 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268 
269 	/* Socket identity is still unknown (sport may be zero).
270 	 * However we set state to SYN-SENT and, without releasing the socket
271 	 * lock, select a source port, enter ourselves into the hash tables and
272 	 * complete initialization after this.
273 	 */
274 	tcp_set_state(sk, TCP_SYN_SENT);
275 	err = inet_hash_connect(tcp_death_row, sk);
276 	if (err)
277 		goto failure;
278 
279 	sk_set_txhash(sk);
280 
281 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 			       inet->inet_sport, inet->inet_dport, sk);
283 	if (IS_ERR(rt)) {
284 		err = PTR_ERR(rt);
285 		rt = NULL;
286 		goto failure;
287 	}
288 	/* OK, now commit destination to socket.  */
289 	sk->sk_gso_type = SKB_GSO_TCPV4;
290 	sk_setup_caps(sk, &rt->dst);
291 	rt = NULL;
292 
293 	if (likely(!tp->repair)) {
294 		if (!tp->write_seq)
295 			WRITE_ONCE(tp->write_seq,
296 				   secure_tcp_seq(inet->inet_saddr,
297 						  inet->inet_daddr,
298 						  inet->inet_sport,
299 						  usin->sin_port));
300 		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 						 inet->inet_saddr,
302 						 inet->inet_daddr);
303 	}
304 
305 	inet->inet_id = prandom_u32();
306 
307 	if (tcp_fastopen_defer_connect(sk, &err))
308 		return err;
309 	if (err)
310 		goto failure;
311 
312 	err = tcp_connect(sk);
313 
314 	if (err)
315 		goto failure;
316 
317 	return 0;
318 
319 failure:
320 	/*
321 	 * This unhashes the socket and releases the local port,
322 	 * if necessary.
323 	 */
324 	tcp_set_state(sk, TCP_CLOSE);
325 	ip_rt_put(rt);
326 	sk->sk_route_caps = 0;
327 	inet->inet_dport = 0;
328 	return err;
329 }
330 EXPORT_SYMBOL(tcp_v4_connect);
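/* tcp_v4_connect() is reached from the connect(2) system call via
 * inet_stream_connect(). A minimal userspace sketch (illustrative only,
 * not part of this file) that exercises the path above:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The route is resolved first, a local port is picked by
 * inet_hash_connect(), and tcp_connect() finally sends the SYN.
 */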
331 
332 /*
333  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
334  * It can be called through tcp_release_cb() if the socket was owned by the
335  * user at the time tcp_v4_err() was called to handle the ICMP message.
336  */
337 void tcp_v4_mtu_reduced(struct sock *sk)
338 {
339 	struct inet_sock *inet = inet_sk(sk);
340 	struct dst_entry *dst;
341 	u32 mtu;
342 
343 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 		return;
345 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 	dst = inet_csk_update_pmtu(sk, mtu);
347 	if (!dst)
348 		return;
349 
350 	/* Something is about to go wrong... Remember the soft error
351 	 * in case this connection is not able to recover.
352 	 */
353 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 		sk->sk_err_soft = EMSGSIZE;
355 
356 	mtu = dst_mtu(dst);
357 
358 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 	    ip_sk_accept_pmtu(sk) &&
360 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 		tcp_sync_mss(sk, mtu);
362 
363 		/* Resend the TCP packet because it's
364 		 * clear that the old packet has been
365 		 * dropped. This is the new "fast" path mtu
366 		 * discovery.
367 		 */
368 		tcp_simple_retransmit(sk);
369 	} /* else let the usual retransmit timer handle it */
370 }
371 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372 
373 static void do_redirect(struct sk_buff *skb, struct sock *sk)
374 {
375 	struct dst_entry *dst = __sk_dst_check(sk, 0);
376 
377 	if (dst)
378 		dst->ops->redirect(dst, sk, skb);
379 }
380 
381 
382 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384 {
385 	struct request_sock *req = inet_reqsk(sk);
386 	struct net *net = sock_net(sk);
387 
388 	/* ICMPs are not backlogged, hence we cannot get
389 	 * an established socket here.
390 	 */
391 	if (seq != tcp_rsk(req)->snt_isn) {
392 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 	} else if (abort) {
394 		/*
395 		 * Still in SYN_RECV, just remove it silently.
396 		 * There is no good way to pass the error to the newly
397 		 * created socket, and POSIX does not want network
398 		 * errors returned from accept().
399 		 */
400 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 		tcp_listendrop(req->rsk_listener);
402 	}
403 	reqsk_put(req);
404 }
405 EXPORT_SYMBOL(tcp_req_err);
406 
407 /* TCP-LD (RFC 6069) logic */
408 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409 {
410 	struct inet_connection_sock *icsk = inet_csk(sk);
411 	struct tcp_sock *tp = tcp_sk(sk);
412 	struct sk_buff *skb;
413 	s32 remaining;
414 	u32 delta_us;
415 
416 	if (sock_owned_by_user(sk))
417 		return;
418 
419 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 	    !icsk->icsk_backoff)
421 		return;
422 
423 	skb = tcp_rtx_queue_head(sk);
424 	if (WARN_ON_ONCE(!skb))
425 		return;
426 
427 	icsk->icsk_backoff--;
428 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430 
431 	tcp_mstamp_refresh(tp);
432 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434 
435 	if (remaining > 0) {
436 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 					  remaining, TCP_RTO_MAX);
438 	} else {
439 		/* RTO revert clocked out retransmission.
440 		 * Will retransmit now.
441 		 */
442 		tcp_retransmit_timer(sk);
443 	}
444 }
445 EXPORT_SYMBOL(tcp_ld_RTO_revert);
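/* Rough worked example of the revert above: with a base RTO of 200 ms and
 * icsk_backoff == 2, the effective RTO has been backed off to 800 ms. An
 * ICMP net/host unreachable matching snd_una drops the backoff to 1, so
 * the effective RTO becomes 400 ms. If more than 400 ms have already
 * elapsed since the head of the retransmit queue was sent, the
 * retransmission fires immediately; otherwise the timer is re-armed for
 * the remaining time.
 */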
446 
447 /*
448  * This routine is called by the ICMP module when it gets some
449  * sort of error condition.  If err < 0 then the socket should
450  * be closed and the error returned to the user.  If err > 0
451  * it's just the icmp type << 8 | icmp code.  After adjustment
452  * header points to the first 8 bytes of the tcp header.  We need
453  * to find the appropriate port.
454  *
455  * The locking strategy used here is very "optimistic". When
456  * someone else accesses the socket the ICMP is just dropped
457  * and for some paths there is no check at all.
458  * A more general error queue to queue errors for later handling
459  * is probably better.
460  *
461  */
462 
463 int tcp_v4_err(struct sk_buff *skb, u32 info)
464 {
465 	const struct iphdr *iph = (const struct iphdr *)skb->data;
466 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 	struct tcp_sock *tp;
468 	struct inet_sock *inet;
469 	const int type = icmp_hdr(skb)->type;
470 	const int code = icmp_hdr(skb)->code;
471 	struct sock *sk;
472 	struct request_sock *fastopen;
473 	u32 seq, snd_una;
474 	int err;
475 	struct net *net = dev_net(skb->dev);
476 
477 	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 				       th->dest, iph->saddr, ntohs(th->source),
479 				       inet_iif(skb), 0);
480 	if (!sk) {
481 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 		return -ENOENT;
483 	}
484 	if (sk->sk_state == TCP_TIME_WAIT) {
485 		inet_twsk_put(inet_twsk(sk));
486 		return 0;
487 	}
488 	seq = ntohl(th->seq);
489 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 				     type == ICMP_TIME_EXCEEDED ||
492 				     (type == ICMP_DEST_UNREACH &&
493 				      (code == ICMP_NET_UNREACH ||
494 				       code == ICMP_HOST_UNREACH)));
495 		return 0;
496 	}
497 
498 	bh_lock_sock(sk);
499 	/* If too many ICMPs get dropped on busy
500 	 * servers this needs to be solved differently.
501 	 * We do take care of PMTU discovery (RFC1191) special case :
502 	 * we can receive locally generated ICMP messages while socket is held.
503 	 */
504 	if (sock_owned_by_user(sk)) {
505 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 	}
508 	if (sk->sk_state == TCP_CLOSE)
509 		goto out;
510 
511 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 		goto out;
514 	}
515 
516 	tp = tcp_sk(sk);
517 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
518 	fastopen = rcu_dereference(tp->fastopen_rsk);
519 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 	if (sk->sk_state != TCP_LISTEN &&
521 	    !between(seq, snd_una, tp->snd_nxt)) {
522 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 		goto out;
524 	}
525 
526 	switch (type) {
527 	case ICMP_REDIRECT:
528 		if (!sock_owned_by_user(sk))
529 			do_redirect(skb, sk);
530 		goto out;
531 	case ICMP_SOURCE_QUENCH:
532 		/* Just silently ignore these. */
533 		goto out;
534 	case ICMP_PARAMETERPROB:
535 		err = EPROTO;
536 		break;
537 	case ICMP_DEST_UNREACH:
538 		if (code > NR_ICMP_UNREACH)
539 			goto out;
540 
541 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
542 			/* We are not interested in TCP_LISTEN and open_requests
543 			 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
544 			 * they should go through unfragmented).
545 			 */
546 			if (sk->sk_state == TCP_LISTEN)
547 				goto out;
548 
549 			WRITE_ONCE(tp->mtu_info, info);
550 			if (!sock_owned_by_user(sk)) {
551 				tcp_v4_mtu_reduced(sk);
552 			} else {
553 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 					sock_hold(sk);
555 			}
556 			goto out;
557 		}
558 
559 		err = icmp_err_convert[code].errno;
560 		/* Check if this ICMP message allows reverting the backoff
561 		 * (see RFC 6069).
562 		 */
563 		if (!fastopen &&
564 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 			tcp_ld_RTO_revert(sk, seq);
566 		break;
567 	case ICMP_TIME_EXCEEDED:
568 		err = EHOSTUNREACH;
569 		break;
570 	default:
571 		goto out;
572 	}
573 
574 	switch (sk->sk_state) {
575 	case TCP_SYN_SENT:
576 	case TCP_SYN_RECV:
577 		/* Only in fast or simultaneous open. If a fast open socket is
578 		 * already accepted it is treated as a connected one below.
579 		 */
580 		if (fastopen && !fastopen->sk)
581 			break;
582 
583 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584 
585 		if (!sock_owned_by_user(sk)) {
586 			sk->sk_err = err;
587 
588 			sk_error_report(sk);
589 
590 			tcp_done(sk);
591 		} else {
592 			sk->sk_err_soft = err;
593 		}
594 		goto out;
595 	}
596 
597 	/* If we've already connected we will keep trying
598 	 * until we time out, or the user gives up.
599 	 *
600 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
601 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
602 	 * but it is obsoleted by PMTU discovery).
603 	 *
604 	 * Note that in the modern internet, where routing is unreliable
605 	 * and broken firewalls sit in every dark corner sending random
606 	 * errors as ordered by their masters, even these two messages have
607 	 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
608 	 *
609 	 * Now we are in compliance with RFCs.
610 	 *							--ANK (980905)
611 	 */
612 
613 	inet = inet_sk(sk);
614 	if (!sock_owned_by_user(sk) && inet->recverr) {
615 		sk->sk_err = err;
616 		sk_error_report(sk);
617 	} else	{ /* Only an error on timeout */
618 		sk->sk_err_soft = err;
619 	}
620 
621 out:
622 	bh_unlock_sock(sk);
623 	sock_put(sk);
624 	return 0;
625 }
626 
627 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628 {
629 	struct tcphdr *th = tcp_hdr(skb);
630 
631 	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 	skb->csum_start = skb_transport_header(skb) - skb->head;
633 	skb->csum_offset = offsetof(struct tcphdr, check);
634 }
635 
636 /* This routine computes an IPv4 TCP checksum. */
637 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638 {
639 	const struct inet_sock *inet = inet_sk(sk);
640 
641 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642 }
643 EXPORT_SYMBOL(tcp_v4_send_check);
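/* Note that only the pseudo-header sum is stored in th->check here;
 * csum_start/csum_offset tell the device (or the software fallback in
 * skb_checksum_help()) where to complete the checksum over the TCP header
 * and payload - the usual CHECKSUM_PARTIAL offload contract.
 */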
644 
645 /*
646  *	This routine will send an RST to the other tcp.
647  *
648  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
649  *		      for the reset?
650  *	Answer: if a packet caused the RST, it is not for a socket
651  *		existing in our system; if it is matched to a socket,
652  *		it is just a duplicate segment or a bug in the other side's TCP.
653  *		So we build the reply based only on the parameters
654  *		that arrived with the segment.
655  *	Exception: precedence violation. We do not implement it in any case.
656  */
657 
658 #ifdef CONFIG_TCP_MD5SIG
659 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660 #else
661 #define OPTION_BYTES sizeof(__be32)
662 #endif
663 
664 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665 {
666 	const struct tcphdr *th = tcp_hdr(skb);
667 	struct {
668 		struct tcphdr th;
669 		__be32 opt[OPTION_BYTES / sizeof(__be32)];
670 	} rep;
671 	struct ip_reply_arg arg;
672 #ifdef CONFIG_TCP_MD5SIG
673 	struct tcp_md5sig_key *key = NULL;
674 	const __u8 *hash_location = NULL;
675 	unsigned char newhash[16];
676 	int genhash;
677 	struct sock *sk1 = NULL;
678 #endif
679 	u64 transmit_time = 0;
680 	struct sock *ctl_sk;
681 	struct net *net;
682 
683 	/* Never send a reset in response to a reset. */
684 	if (th->rst)
685 		return;
686 
687 	/* If sk is not NULL, it means we did a successful lookup and the
688 	 * incoming route had to be correct. prequeue might have dropped our dst.
689 	 */
690 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 		return;
692 
693 	/* Swap the send and the receive. */
694 	memset(&rep, 0, sizeof(rep));
695 	rep.th.dest   = th->source;
696 	rep.th.source = th->dest;
697 	rep.th.doff   = sizeof(struct tcphdr) / 4;
698 	rep.th.rst    = 1;
699 
700 	if (th->ack) {
701 		rep.th.seq = th->ack_seq;
702 	} else {
703 		rep.th.ack = 1;
704 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 				       skb->len - (th->doff << 2));
706 	}
707 
708 	memset(&arg, 0, sizeof(arg));
709 	arg.iov[0].iov_base = (unsigned char *)&rep;
710 	arg.iov[0].iov_len  = sizeof(rep.th);
711 
712 	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713 #ifdef CONFIG_TCP_MD5SIG
714 	rcu_read_lock();
715 	hash_location = tcp_parse_md5sig_option(th);
716 	if (sk && sk_fullsock(sk)) {
717 		const union tcp_md5_addr *addr;
718 		int l3index;
719 
720 		/* sdif set, means packet ingressed via a device
721 		 * in an L3 domain and inet_iif is set to it.
722 		 */
723 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 	} else if (hash_location) {
727 		const union tcp_md5_addr *addr;
728 		int sdif = tcp_v4_sdif(skb);
729 		int dif = inet_iif(skb);
730 		int l3index;
731 
732 		/*
733 		 * The active side is lost. Try to find the listening socket
734 		 * through the source port, and then find the md5 key through
735 		 * the listening socket. We do not loosen security here:
736 		 * the incoming packet is checked against the md5 hash of the
737 		 * found key, and no RST is generated if the hash doesn't match.
738 		 */
739 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 					     ip_hdr(skb)->saddr,
741 					     th->source, ip_hdr(skb)->daddr,
742 					     ntohs(th->source), dif, sdif);
743 		/* don't send rst if it can't find key */
744 		if (!sk1)
745 			goto out;
746 
747 		/* sdif set, means packet ingressed via a device
748 		 * in an L3 domain and dif is set to it.
749 		 */
750 		l3index = sdif ? dif : 0;
751 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 		if (!key)
754 			goto out;
755 
756 
757 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
759 			goto out;
760 
761 	}
762 
763 	if (key) {
764 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 				   (TCPOPT_NOP << 16) |
766 				   (TCPOPT_MD5SIG << 8) |
767 				   TCPOLEN_MD5SIG);
768 		/* Update length and the length the header thinks exists */
769 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 		rep.th.doff = arg.iov[0].iov_len / 4;
771 
772 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 				     key, ip_hdr(skb)->saddr,
774 				     ip_hdr(skb)->daddr, &rep.th);
775 	}
776 #endif
777 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 	if (rep.opt[0] == 0) {
779 		__be32 mrst = mptcp_reset_option(skb);
780 
781 		if (mrst) {
782 			rep.opt[0] = mrst;
783 			arg.iov[0].iov_len += sizeof(mrst);
784 			rep.th.doff = arg.iov[0].iov_len / 4;
785 		}
786 	}
787 
788 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 				      ip_hdr(skb)->saddr, /* XXX */
790 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793 
794 	/* When the socket is gone, all binding information is lost and
795 	 * routing might fail in this case. No choice here: if we choose to force
796 	 * the input interface, we will misroute in case of an asymmetric route.
797 	 */
797 	 */
798 	if (sk) {
799 		arg.bound_dev_if = sk->sk_bound_dev_if;
800 		if (sk_fullsock(sk))
801 			trace_tcp_send_reset(sk, skb);
802 	}
803 
804 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806 
807 	arg.tos = ip_hdr(skb)->tos;
808 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809 	local_bh_disable();
810 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811 	if (sk) {
812 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 				   inet_twsk(sk)->tw_mark : sk->sk_mark;
814 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 				   inet_twsk(sk)->tw_priority : sk->sk_priority;
816 		transmit_time = tcp_transmit_time(sk);
817 	}
818 	ip_send_unicast_reply(ctl_sk,
819 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 			      &arg, arg.iov[0].iov_len,
822 			      transmit_time);
823 
824 	ctl_sk->sk_mark = 0;
825 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827 	local_bh_enable();
828 
829 #ifdef CONFIG_TCP_MD5SIG
830 out:
831 	rcu_read_unlock();
832 #endif
833 }
834 
835 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
836    outside socket context, is certainly ugly. What can I do?
837  */
838 
839 static void tcp_v4_send_ack(const struct sock *sk,
840 			    struct sk_buff *skb, u32 seq, u32 ack,
841 			    u32 win, u32 tsval, u32 tsecr, int oif,
842 			    struct tcp_md5sig_key *key,
843 			    int reply_flags, u8 tos)
844 {
845 	const struct tcphdr *th = tcp_hdr(skb);
846 	struct {
847 		struct tcphdr th;
848 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849 #ifdef CONFIG_TCP_MD5SIG
850 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851 #endif
852 			];
853 	} rep;
854 	struct net *net = sock_net(sk);
855 	struct ip_reply_arg arg;
856 	struct sock *ctl_sk;
857 	u64 transmit_time;
858 
859 	memset(&rep.th, 0, sizeof(struct tcphdr));
860 	memset(&arg, 0, sizeof(arg));
861 
862 	arg.iov[0].iov_base = (unsigned char *)&rep;
863 	arg.iov[0].iov_len  = sizeof(rep.th);
864 	if (tsecr) {
865 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 				   (TCPOPT_TIMESTAMP << 8) |
867 				   TCPOLEN_TIMESTAMP);
868 		rep.opt[1] = htonl(tsval);
869 		rep.opt[2] = htonl(tsecr);
870 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871 	}
872 
873 	/* Swap the send and the receive. */
874 	rep.th.dest    = th->source;
875 	rep.th.source  = th->dest;
876 	rep.th.doff    = arg.iov[0].iov_len / 4;
877 	rep.th.seq     = htonl(seq);
878 	rep.th.ack_seq = htonl(ack);
879 	rep.th.ack     = 1;
880 	rep.th.window  = htons(win);
881 
882 #ifdef CONFIG_TCP_MD5SIG
883 	if (key) {
884 		int offset = (tsecr) ? 3 : 0;
885 
886 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887 					  (TCPOPT_NOP << 16) |
888 					  (TCPOPT_MD5SIG << 8) |
889 					  TCPOLEN_MD5SIG);
890 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 		rep.th.doff = arg.iov[0].iov_len/4;
892 
893 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894 				    key, ip_hdr(skb)->saddr,
895 				    ip_hdr(skb)->daddr, &rep.th);
896 	}
897 #endif
898 	arg.flags = reply_flags;
899 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 				      ip_hdr(skb)->saddr, /* XXX */
901 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903 	if (oif)
904 		arg.bound_dev_if = oif;
905 	arg.tos = tos;
906 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907 	local_bh_disable();
908 	ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909 	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 			   inet_twsk(sk)->tw_mark : sk->sk_mark;
911 	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 			   inet_twsk(sk)->tw_priority : sk->sk_priority;
913 	transmit_time = tcp_transmit_time(sk);
914 	ip_send_unicast_reply(ctl_sk,
915 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 			      &arg, arg.iov[0].iov_len,
918 			      transmit_time);
919 
920 	ctl_sk->sk_mark = 0;
921 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922 	local_bh_enable();
923 }
924 
925 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926 {
927 	struct inet_timewait_sock *tw = inet_twsk(sk);
928 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929 
930 	tcp_v4_send_ack(sk, skb,
931 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933 			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934 			tcptw->tw_ts_recent,
935 			tw->tw_bound_dev_if,
936 			tcp_twsk_md5_key(tcptw),
937 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938 			tw->tw_tos
939 			);
940 
941 	inet_twsk_put(tw);
942 }
943 
944 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945 				  struct request_sock *req)
946 {
947 	const union tcp_md5_addr *addr;
948 	int l3index;
949 
950 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952 	 */
953 	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954 					     tcp_sk(sk)->snd_nxt;
955 
956 	/* RFC 7323 2.3
957 	 * The window field (SEG.WND) of every outgoing segment, with the
958 	 * exception of <SYN> segments, MUST be right-shifted by
959 	 * Rcv.Wind.Shift bits:
960 	 */
961 	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962 	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963 	tcp_v4_send_ack(sk, skb, seq,
964 			tcp_rsk(req)->rcv_nxt,
965 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967 			req->ts_recent,
968 			0,
969 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971 			ip_hdr(skb)->tos);
972 }
973 
974 /*
975  *	Send a SYN-ACK after having received a SYN.
976  *	This still operates on a request_sock only, not on a big
977  *	socket.
978  */
979 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980 			      struct flowi *fl,
981 			      struct request_sock *req,
982 			      struct tcp_fastopen_cookie *foc,
983 			      enum tcp_synack_type synack_type,
984 			      struct sk_buff *syn_skb)
985 {
986 	const struct inet_request_sock *ireq = inet_rsk(req);
987 	struct flowi4 fl4;
988 	int err = -1;
989 	struct sk_buff *skb;
990 	u8 tos;
991 
992 	/* First, grab a route. */
993 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994 		return -1;
995 
996 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997 
998 	if (skb) {
999 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000 
1001 		tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002 				(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 				(inet_sk(sk)->tos & INET_ECN_MASK) :
1004 				inet_sk(sk)->tos;
1005 
1006 		if (!INET_ECN_is_capable(tos) &&
1007 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 			tos |= INET_ECN_ECT_0;
1009 
1010 		rcu_read_lock();
1011 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012 					    ireq->ir_rmt_addr,
1013 					    rcu_dereference(ireq->ireq_opt),
1014 					    tos);
1015 		rcu_read_unlock();
1016 		err = net_xmit_eval(err);
1017 	}
1018 
1019 	return err;
1020 }
1021 
1022 /*
1023  *	IPv4 request_sock destructor.
1024  */
1025 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026 {
1027 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028 }
1029 
1030 #ifdef CONFIG_TCP_MD5SIG
1031 /*
1032  * RFC2385 MD5 checksumming requires a mapping of
1033  * IP address->MD5 Key.
1034  * We need to maintain these in the sk structure.
1035  */
1036 
1037 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038 EXPORT_SYMBOL(tcp_md5_needed);
1039 
1040 /* Find the Key structure for an address.  */
1041 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1042 					   const union tcp_md5_addr *addr,
1043 					   int family)
1044 {
1045 	const struct tcp_sock *tp = tcp_sk(sk);
1046 	struct tcp_md5sig_key *key;
1047 	const struct tcp_md5sig_info *md5sig;
1048 	__be32 mask;
1049 	struct tcp_md5sig_key *best_match = NULL;
1050 	bool match;
1051 
1052 	/* caller either holds rcu_read_lock() or socket lock */
1053 	md5sig = rcu_dereference_check(tp->md5sig_info,
1054 				       lockdep_sock_is_held(sk));
1055 	if (!md5sig)
1056 		return NULL;
1057 
1058 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1059 				 lockdep_sock_is_held(sk)) {
1060 		if (key->family != family)
1061 			continue;
1062 		if (key->l3index && key->l3index != l3index)
1063 			continue;
1064 		if (family == AF_INET) {
1065 			mask = inet_make_mask(key->prefixlen);
1066 			match = (key->addr.a4.s_addr & mask) ==
1067 				(addr->a4.s_addr & mask);
1068 #if IS_ENABLED(CONFIG_IPV6)
1069 		} else if (family == AF_INET6) {
1070 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1071 						  key->prefixlen);
1072 #endif
1073 		} else {
1074 			match = false;
1075 		}
1076 
1077 		if (match && (!best_match ||
1078 			      key->prefixlen > best_match->prefixlen))
1079 			best_match = key;
1080 	}
1081 	return best_match;
1082 }
1083 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1084 
1085 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1086 						      const union tcp_md5_addr *addr,
1087 						      int family, u8 prefixlen,
1088 						      int l3index)
1089 {
1090 	const struct tcp_sock *tp = tcp_sk(sk);
1091 	struct tcp_md5sig_key *key;
1092 	unsigned int size = sizeof(struct in_addr);
1093 	const struct tcp_md5sig_info *md5sig;
1094 
1095 	/* caller either holds rcu_read_lock() or socket lock */
1096 	md5sig = rcu_dereference_check(tp->md5sig_info,
1097 				       lockdep_sock_is_held(sk));
1098 	if (!md5sig)
1099 		return NULL;
1100 #if IS_ENABLED(CONFIG_IPV6)
1101 	if (family == AF_INET6)
1102 		size = sizeof(struct in6_addr);
1103 #endif
1104 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1105 				 lockdep_sock_is_held(sk)) {
1106 		if (key->family != family)
1107 			continue;
1108 		if (key->l3index && key->l3index != l3index)
1109 			continue;
1110 		if (!memcmp(&key->addr, addr, size) &&
1111 		    key->prefixlen == prefixlen)
1112 			return key;
1113 	}
1114 	return NULL;
1115 }
1116 
1117 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1118 					 const struct sock *addr_sk)
1119 {
1120 	const union tcp_md5_addr *addr;
1121 	int l3index;
1122 
1123 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1124 						 addr_sk->sk_bound_dev_if);
1125 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1126 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1127 }
1128 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1129 
1130 /* This can be called on a newly created socket, from other files */
1131 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1132 		   int family, u8 prefixlen, int l3index,
1133 		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
1134 {
1135 	/* Add Key to the list */
1136 	struct tcp_md5sig_key *key;
1137 	struct tcp_sock *tp = tcp_sk(sk);
1138 	struct tcp_md5sig_info *md5sig;
1139 
1140 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1141 	if (key) {
1142 		/* Pre-existing entry - just update that one.
1143 		 * Note that the key might be used concurrently.
1144 		 * data_race() is telling kcsan that we do not care about
1145 		 * key mismatches, since changing the MD5 key on live flows
1146 		 * can lead to packet drops.
1147 		 */
1148 		data_race(memcpy(key->key, newkey, newkeylen));
1149 
1150 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1151 		 * Also note that a reader could catch new key->keylen value
1152 		 * but old key->key[], this is the reason we use __GFP_ZERO
1153 		 * at sock_kmalloc() time below these lines.
1154 		 */
1155 		WRITE_ONCE(key->keylen, newkeylen);
1156 
1157 		return 0;
1158 	}
1159 
1160 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1161 					   lockdep_sock_is_held(sk));
1162 	if (!md5sig) {
1163 		md5sig = kmalloc(sizeof(*md5sig), gfp);
1164 		if (!md5sig)
1165 			return -ENOMEM;
1166 
1167 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1168 		INIT_HLIST_HEAD(&md5sig->head);
1169 		rcu_assign_pointer(tp->md5sig_info, md5sig);
1170 	}
1171 
1172 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1173 	if (!key)
1174 		return -ENOMEM;
1175 	if (!tcp_alloc_md5sig_pool()) {
1176 		sock_kfree_s(sk, key, sizeof(*key));
1177 		return -ENOMEM;
1178 	}
1179 
1180 	memcpy(key->key, newkey, newkeylen);
1181 	key->keylen = newkeylen;
1182 	key->family = family;
1183 	key->prefixlen = prefixlen;
1184 	key->l3index = l3index;
1185 	memcpy(&key->addr, addr,
1186 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1187 				      sizeof(struct in_addr));
1188 	hlist_add_head_rcu(&key->node, &md5sig->head);
1189 	return 0;
1190 }
1191 EXPORT_SYMBOL(tcp_md5_do_add);
1192 
1193 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1194 		   u8 prefixlen, int l3index)
1195 {
1196 	struct tcp_md5sig_key *key;
1197 
1198 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1199 	if (!key)
1200 		return -ENOENT;
1201 	hlist_del_rcu(&key->node);
1202 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1203 	kfree_rcu(key, rcu);
1204 	return 0;
1205 }
1206 EXPORT_SYMBOL(tcp_md5_do_del);
1207 
1208 static void tcp_clear_md5_list(struct sock *sk)
1209 {
1210 	struct tcp_sock *tp = tcp_sk(sk);
1211 	struct tcp_md5sig_key *key;
1212 	struct hlist_node *n;
1213 	struct tcp_md5sig_info *md5sig;
1214 
1215 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1216 
1217 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1218 		hlist_del_rcu(&key->node);
1219 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1220 		kfree_rcu(key, rcu);
1221 	}
1222 }
1223 
1224 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1225 				 sockptr_t optval, int optlen)
1226 {
1227 	struct tcp_md5sig cmd;
1228 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1229 	const union tcp_md5_addr *addr;
1230 	u8 prefixlen = 32;
1231 	int l3index = 0;
1232 
1233 	if (optlen < sizeof(cmd))
1234 		return -EINVAL;
1235 
1236 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1237 		return -EFAULT;
1238 
1239 	if (sin->sin_family != AF_INET)
1240 		return -EINVAL;
1241 
1242 	if (optname == TCP_MD5SIG_EXT &&
1243 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1244 		prefixlen = cmd.tcpm_prefixlen;
1245 		if (prefixlen > 32)
1246 			return -EINVAL;
1247 	}
1248 
1249 	if (optname == TCP_MD5SIG_EXT &&
1250 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1251 		struct net_device *dev;
1252 
1253 		rcu_read_lock();
1254 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1255 		if (dev && netif_is_l3_master(dev))
1256 			l3index = dev->ifindex;
1257 
1258 		rcu_read_unlock();
1259 
1260 		/* ok to reference set/not set outside of rcu;
1261 		 * right now device MUST be an L3 master
1262 		 */
1263 		if (!dev || !l3index)
1264 			return -EINVAL;
1265 	}
1266 
1267 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1268 
1269 	if (!cmd.tcpm_keylen)
1270 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1271 
1272 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1273 		return -EINVAL;
1274 
1275 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1276 			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1277 }
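/* tcp_v4_parse_md5_keys() backs the TCP_MD5SIG/TCP_MD5SIG_EXT socket
 * options. A minimal userspace sketch (illustrative only, not part of
 * this file) installing a key for one peer address:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "0123456789abcdef", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that address instead.
 */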
1278 
1279 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1280 				   __be32 daddr, __be32 saddr,
1281 				   const struct tcphdr *th, int nbytes)
1282 {
1283 	struct tcp4_pseudohdr *bp;
1284 	struct scatterlist sg;
1285 	struct tcphdr *_th;
1286 
1287 	bp = hp->scratch;
1288 	bp->saddr = saddr;
1289 	bp->daddr = daddr;
1290 	bp->pad = 0;
1291 	bp->protocol = IPPROTO_TCP;
1292 	bp->len = cpu_to_be16(nbytes);
1293 
1294 	_th = (struct tcphdr *)(bp + 1);
1295 	memcpy(_th, th, sizeof(*th));
1296 	_th->check = 0;
1297 
1298 	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1299 	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1300 				sizeof(*bp) + sizeof(*th));
1301 	return crypto_ahash_update(hp->md5_req);
1302 }
1303 
1304 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1305 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1306 {
1307 	struct tcp_md5sig_pool *hp;
1308 	struct ahash_request *req;
1309 
1310 	hp = tcp_get_md5sig_pool();
1311 	if (!hp)
1312 		goto clear_hash_noput;
1313 	req = hp->md5_req;
1314 
1315 	if (crypto_ahash_init(req))
1316 		goto clear_hash;
1317 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1318 		goto clear_hash;
1319 	if (tcp_md5_hash_key(hp, key))
1320 		goto clear_hash;
1321 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1322 	if (crypto_ahash_final(req))
1323 		goto clear_hash;
1324 
1325 	tcp_put_md5sig_pool();
1326 	return 0;
1327 
1328 clear_hash:
1329 	tcp_put_md5sig_pool();
1330 clear_hash_noput:
1331 	memset(md5_hash, 0, 16);
1332 	return 1;
1333 }
1334 
1335 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1336 			const struct sock *sk,
1337 			const struct sk_buff *skb)
1338 {
1339 	struct tcp_md5sig_pool *hp;
1340 	struct ahash_request *req;
1341 	const struct tcphdr *th = tcp_hdr(skb);
1342 	__be32 saddr, daddr;
1343 
1344 	if (sk) { /* valid for establish/request sockets */
1345 		saddr = sk->sk_rcv_saddr;
1346 		daddr = sk->sk_daddr;
1347 	} else {
1348 		const struct iphdr *iph = ip_hdr(skb);
1349 		saddr = iph->saddr;
1350 		daddr = iph->daddr;
1351 	}
1352 
1353 	hp = tcp_get_md5sig_pool();
1354 	if (!hp)
1355 		goto clear_hash_noput;
1356 	req = hp->md5_req;
1357 
1358 	if (crypto_ahash_init(req))
1359 		goto clear_hash;
1360 
1361 	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1362 		goto clear_hash;
1363 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1364 		goto clear_hash;
1365 	if (tcp_md5_hash_key(hp, key))
1366 		goto clear_hash;
1367 	ahash_request_set_crypt(req, NULL, md5_hash, 0);
1368 	if (crypto_ahash_final(req))
1369 		goto clear_hash;
1370 
1371 	tcp_put_md5sig_pool();
1372 	return 0;
1373 
1374 clear_hash:
1375 	tcp_put_md5sig_pool();
1376 clear_hash_noput:
1377 	memset(md5_hash, 0, 16);
1378 	return 1;
1379 }
1380 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1381 
1382 #endif
1383 
1384 /* Called with rcu_read_lock() */
1385 static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1386 				    const struct sk_buff *skb,
1387 				    int dif, int sdif)
1388 {
1389 #ifdef CONFIG_TCP_MD5SIG
1390 	/*
1391 	 * This gets called for each TCP segment that arrives
1392 	 * so we want to be efficient.
1393 	 * We have 3 drop cases:
1394 	 * o No MD5 hash and one expected.
1395 	 * o MD5 hash and we're not expecting one.
1396 	 * o MD5 hash and it's wrong.
1397 	 */
1398 	const __u8 *hash_location = NULL;
1399 	struct tcp_md5sig_key *hash_expected;
1400 	const struct iphdr *iph = ip_hdr(skb);
1401 	const struct tcphdr *th = tcp_hdr(skb);
1402 	const union tcp_md5_addr *addr;
1403 	unsigned char newhash[16];
1404 	int genhash, l3index;
1405 
1406 	/* sdif set, means packet ingressed via a device
1407 	 * in an L3 domain and dif is set to the l3mdev
1408 	 */
1409 	l3index = sdif ? dif : 0;
1410 
1411 	addr = (union tcp_md5_addr *)&iph->saddr;
1412 	hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1413 	hash_location = tcp_parse_md5sig_option(th);
1414 
1415 	/* We've parsed the options - do we have a hash? */
1416 	if (!hash_expected && !hash_location)
1417 		return false;
1418 
1419 	if (hash_expected && !hash_location) {
1420 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1421 		return true;
1422 	}
1423 
1424 	if (!hash_expected && hash_location) {
1425 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1426 		return true;
1427 	}
1428 
1429 	/* Okay, so this is hash_expected and hash_location -
1430 	 * so we need to calculate the checksum.
1431 	 */
1432 	genhash = tcp_v4_md5_hash_skb(newhash,
1433 				      hash_expected,
1434 				      NULL, skb);
1435 
1436 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1437 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1438 		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1439 				     &iph->saddr, ntohs(th->source),
1440 				     &iph->daddr, ntohs(th->dest),
1441 				     genhash ? " tcp_v4_calc_md5_hash failed"
1442 				     : "", l3index);
1443 		return true;
1444 	}
1445 	return false;
1446 #endif
1447 	return false;
1448 }
1449 
1450 static void tcp_v4_init_req(struct request_sock *req,
1451 			    const struct sock *sk_listener,
1452 			    struct sk_buff *skb)
1453 {
1454 	struct inet_request_sock *ireq = inet_rsk(req);
1455 	struct net *net = sock_net(sk_listener);
1456 
1457 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1458 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1459 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1460 }
1461 
1462 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1463 					  struct sk_buff *skb,
1464 					  struct flowi *fl,
1465 					  struct request_sock *req)
1466 {
1467 	tcp_v4_init_req(req, sk, skb);
1468 
1469 	if (security_inet_conn_request(sk, skb, req))
1470 		return NULL;
1471 
1472 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1473 }
1474 
1475 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1476 	.family		=	PF_INET,
1477 	.obj_size	=	sizeof(struct tcp_request_sock),
1478 	.rtx_syn_ack	=	tcp_rtx_synack,
1479 	.send_ack	=	tcp_v4_reqsk_send_ack,
1480 	.destructor	=	tcp_v4_reqsk_destructor,
1481 	.send_reset	=	tcp_v4_send_reset,
1482 	.syn_ack_timeout =	tcp_syn_ack_timeout,
1483 };
1484 
1485 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1486 	.mss_clamp	=	TCP_MSS_DEFAULT,
1487 #ifdef CONFIG_TCP_MD5SIG
1488 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1489 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1490 #endif
1491 #ifdef CONFIG_SYN_COOKIES
1492 	.cookie_init_seq =	cookie_v4_init_sequence,
1493 #endif
1494 	.route_req	=	tcp_v4_route_req,
1495 	.init_seq	=	tcp_v4_init_seq,
1496 	.init_ts_off	=	tcp_v4_init_ts_off,
1497 	.send_synack	=	tcp_v4_send_synack,
1498 };
1499 
1500 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1501 {
1502 	/* Never answer SYNs sent to broadcast or multicast */
1503 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1504 		goto drop;
1505 
1506 	return tcp_conn_request(&tcp_request_sock_ops,
1507 				&tcp_request_sock_ipv4_ops, sk, skb);
1508 
1509 drop:
1510 	tcp_listendrop(sk);
1511 	return 0;
1512 }
1513 EXPORT_SYMBOL(tcp_v4_conn_request);
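/* From here tcp_conn_request() allocates a request_sock, queues it on the
 * listener and sends the SYN-ACK via tcp_v4_send_synack(); when the accept
 * queue is under pressure it may answer with a syncookie instead (see
 * cookie_v4_init_sequence in tcp_request_sock_ipv4_ops above).
 */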
1514 
1515 
1516 /*
1517  * The three way handshake has completed - we got a valid synack -
1518  * now create the new socket.
1519  */
1520 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1521 				  struct request_sock *req,
1522 				  struct dst_entry *dst,
1523 				  struct request_sock *req_unhash,
1524 				  bool *own_req)
1525 {
1526 	struct inet_request_sock *ireq;
1527 	bool found_dup_sk = false;
1528 	struct inet_sock *newinet;
1529 	struct tcp_sock *newtp;
1530 	struct sock *newsk;
1531 #ifdef CONFIG_TCP_MD5SIG
1532 	const union tcp_md5_addr *addr;
1533 	struct tcp_md5sig_key *key;
1534 	int l3index;
1535 #endif
1536 	struct ip_options_rcu *inet_opt;
1537 
1538 	if (sk_acceptq_is_full(sk))
1539 		goto exit_overflow;
1540 
1541 	newsk = tcp_create_openreq_child(sk, req, skb);
1542 	if (!newsk)
1543 		goto exit_nonewsk;
1544 
1545 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1546 	inet_sk_rx_dst_set(newsk, skb);
1547 
1548 	newtp		      = tcp_sk(newsk);
1549 	newinet		      = inet_sk(newsk);
1550 	ireq		      = inet_rsk(req);
1551 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
1552 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1553 	newsk->sk_bound_dev_if = ireq->ir_iif;
1554 	newinet->inet_saddr   = ireq->ir_loc_addr;
1555 	inet_opt	      = rcu_dereference(ireq->ireq_opt);
1556 	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1557 	newinet->mc_index     = inet_iif(skb);
1558 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1559 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1560 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1561 	if (inet_opt)
1562 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1563 	newinet->inet_id = prandom_u32();
1564 
1565 	/* Set ToS of the new socket based upon the value of incoming SYN.
1566 	 * ECT bits are set later in tcp_init_transfer().
1567 	 */
1568 	if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1569 		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1570 
1571 	if (!dst) {
1572 		dst = inet_csk_route_child_sock(sk, newsk, req);
1573 		if (!dst)
1574 			goto put_and_exit;
1575 	} else {
1576 		/* syncookie case : see end of cookie_v4_check() */
1577 	}
1578 	sk_setup_caps(newsk, dst);
1579 
1580 	tcp_ca_openreq_child(newsk, dst);
1581 
1582 	tcp_sync_mss(newsk, dst_mtu(dst));
1583 	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1584 
1585 	tcp_initialize_rcv_mss(newsk);
1586 
1587 #ifdef CONFIG_TCP_MD5SIG
1588 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1589 	/* Copy over the MD5 key from the original socket */
1590 	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1591 	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1592 	if (key) {
1593 		/*
1594 		 * We're using one, so create a matching key
1595 		 * on the newsk structure. If we fail to get
1596 		 * memory, then we end up not copying the key
1597 		 * across. Shucks.
1598 		 */
1599 		tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1600 			       key->key, key->keylen, GFP_ATOMIC);
1601 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1602 	}
1603 #endif
1604 
1605 	if (__inet_inherit_port(sk, newsk) < 0)
1606 		goto put_and_exit;
1607 	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1608 				       &found_dup_sk);
1609 	if (likely(*own_req)) {
1610 		tcp_move_syn(newtp, req);
1611 		ireq->ireq_opt = NULL;
1612 	} else {
1613 		newinet->inet_opt = NULL;
1614 
1615 		if (!req_unhash && found_dup_sk) {
1616 			/* This code path should only be executed in the
1617 			 * syncookie case
1618 			 */
1619 			bh_unlock_sock(newsk);
1620 			sock_put(newsk);
1621 			newsk = NULL;
1622 		}
1623 	}
1624 	return newsk;
1625 
1626 exit_overflow:
1627 	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1628 exit_nonewsk:
1629 	dst_release(dst);
1630 exit:
1631 	tcp_listendrop(sk);
1632 	return NULL;
1633 put_and_exit:
1634 	newinet->inet_opt = NULL;
1635 	inet_csk_prepare_forced_close(newsk);
1636 	tcp_done(newsk);
1637 	goto exit;
1638 }
1639 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1640 
1641 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1642 {
1643 #ifdef CONFIG_SYN_COOKIES
1644 	const struct tcphdr *th = tcp_hdr(skb);
1645 
1646 	if (!th->syn)
1647 		sk = cookie_v4_check(sk, skb);
1648 #endif
1649 	return sk;
1650 }
1651 
1652 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1653 			 struct tcphdr *th, u32 *cookie)
1654 {
1655 	u16 mss = 0;
1656 #ifdef CONFIG_SYN_COOKIES
1657 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1658 				    &tcp_request_sock_ipv4_ops, sk, th);
1659 	if (mss) {
1660 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1661 		tcp_synq_overflow(sk);
1662 	}
1663 #endif
1664 	return mss;
1665 }
1666 
1667 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1668 							   u32));
1669 /* The socket must have its spinlock held when we get
1670  * here, unless it is a TCP_LISTEN socket.
1671  *
1672  * We have a potential double-lock case here, so even when
1673  * doing backlog processing we use the BH locking scheme.
1674  * This is because we cannot sleep with the original spinlock
1675  * held.
1676  */
1677 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1678 {
1679 	struct sock *rsk;
1680 
1681 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1682 		struct dst_entry *dst = sk->sk_rx_dst;
1683 
1684 		sock_rps_save_rxhash(sk, skb);
1685 		sk_mark_napi_id(sk, skb);
1686 		if (dst) {
1687 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1688 			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1689 					     dst, 0)) {
1690 				dst_release(dst);
1691 				sk->sk_rx_dst = NULL;
1692 			}
1693 		}
1694 		tcp_rcv_established(sk, skb);
1695 		return 0;
1696 	}
1697 
1698 	if (tcp_checksum_complete(skb))
1699 		goto csum_err;
1700 
1701 	if (sk->sk_state == TCP_LISTEN) {
1702 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1703 
1704 		if (!nsk)
1705 			goto discard;
1706 		if (nsk != sk) {
1707 			if (tcp_child_process(sk, nsk, skb)) {
1708 				rsk = nsk;
1709 				goto reset;
1710 			}
1711 			return 0;
1712 		}
1713 	} else
1714 		sock_rps_save_rxhash(sk, skb);
1715 
1716 	if (tcp_rcv_state_process(sk, skb)) {
1717 		rsk = sk;
1718 		goto reset;
1719 	}
1720 	return 0;
1721 
1722 reset:
1723 	tcp_v4_send_reset(rsk, skb);
1724 discard:
1725 	kfree_skb(skb);
1726 	/* Be careful here. If this function gets more complicated and
1727 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1728 	 * might be destroyed here. This current version compiles correctly,
1729 	 * but you have been warned.
1730 	 */
1731 	return 0;
1732 
1733 csum_err:
1734 	trace_tcp_bad_csum(skb);
1735 	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1736 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1737 	goto discard;
1738 }
1739 EXPORT_SYMBOL(tcp_v4_do_rcv);
1740 
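/* Early demux: called from the IP receive path before routing, so that an
 * established socket (and its cached rx dst) can be attached to the skb up
 * front and the per-packet socket lookup in tcp_v4_rcv() is avoided.
 */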
1741 int tcp_v4_early_demux(struct sk_buff *skb)
1742 {
1743 	const struct iphdr *iph;
1744 	const struct tcphdr *th;
1745 	struct sock *sk;
1746 
1747 	if (skb->pkt_type != PACKET_HOST)
1748 		return 0;
1749 
1750 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1751 		return 0;
1752 
1753 	iph = ip_hdr(skb);
1754 	th = tcp_hdr(skb);
1755 
1756 	if (th->doff < sizeof(struct tcphdr) / 4)
1757 		return 0;
1758 
1759 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1760 				       iph->saddr, th->source,
1761 				       iph->daddr, ntohs(th->dest),
1762 				       skb->skb_iif, inet_sdif(skb));
1763 	if (sk) {
1764 		skb->sk = sk;
1765 		skb->destructor = sock_edemux;
1766 		if (sk_fullsock(sk)) {
1767 			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1768 
1769 			if (dst)
1770 				dst = dst_check(dst, 0);
1771 			if (dst &&
1772 			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1773 				skb_dst_set_noref(skb, dst);
1774 		}
1775 	}
1776 	return 0;
1777 }
1778 
1779 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1780 {
1781 	u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1782 	u32 tail_gso_size, tail_gso_segs;
1783 	struct skb_shared_info *shinfo;
1784 	const struct tcphdr *th;
1785 	struct tcphdr *thtail;
1786 	struct sk_buff *tail;
1787 	unsigned int hdrlen;
1788 	bool fragstolen;
1789 	u32 gso_segs;
1790 	u32 gso_size;
1791 	int delta;
1792 
1793 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1794 	 * we can fix skb->truesize to its real value to avoid future drops.
1795 	 * This is valid because skb is not yet charged to the socket.
1796 	 * It has been noticed that pure SACK packets were sometimes dropped
1797 	 * (if cooked by drivers without the copybreak feature).
1798 	 */
1799 	skb_condense(skb);
1800 
1801 	skb_dst_drop(skb);
1802 
1803 	if (unlikely(tcp_checksum_complete(skb))) {
1804 		bh_unlock_sock(sk);
1805 		trace_tcp_bad_csum(skb);
1806 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1807 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1808 		return true;
1809 	}
1810 
1811 	/* Attempt coalescing to last skb in backlog, even if we are
1812 	 * above the limits.
1813 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1814 	 */
1815 	th = (const struct tcphdr *)skb->data;
1816 	hdrlen = th->doff * 4;
1817 
1818 	tail = sk->sk_backlog.tail;
1819 	if (!tail)
1820 		goto no_coalesce;
1821 	thtail = (struct tcphdr *)tail->data;
1822 
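	/* Coalescing is only safe if the new skb directly follows the tail
	 * in sequence space, carries the same TOS byte, neither skb has
	 * SYN/RST/URG set, both carry ACK, the ECN bits (ECE/CWR) match,
	 * and the TCP headers (doff and options) are identical.
	 */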
1823 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1824 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1825 	    ((TCP_SKB_CB(tail)->tcp_flags |
1826 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1827 	    !((TCP_SKB_CB(tail)->tcp_flags &
1828 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1829 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1830 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1831 #ifdef CONFIG_TLS_DEVICE
1832 	    tail->decrypted != skb->decrypted ||
1833 #endif
1834 	    thtail->doff != th->doff ||
1835 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1836 		goto no_coalesce;
1837 
1838 	__skb_pull(skb, hdrlen);
1839 
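	/* Record the GSO geometry of both skbs: after a successful coalesce,
	 * the tail's shared info carries the larger MSS and the combined
	 * segment count (capped at 0xFFFF).
	 */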
1840 	shinfo = skb_shinfo(skb);
1841 	gso_size = shinfo->gso_size ?: skb->len;
1842 	gso_segs = shinfo->gso_segs ?: 1;
1843 
1844 	shinfo = skb_shinfo(tail);
1845 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1846 	tail_gso_segs = shinfo->gso_segs ?: 1;
1847 
1848 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1849 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1850 
1851 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1852 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1853 			thtail->window = th->window;
1854 		}
1855 
1856 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1857 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1858 		 * is not entered if we append a packet with a FIN.
1859 		 * SYN, RST, URG are not present.
1860 		 * ACK is set on both packets.
1861 		 * PSH: the TCP stack does not really care about it,
1862 		 *       at least for 'GRO' packets.
1863 		 */
1864 		thtail->fin |= th->fin;
1865 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1866 
1867 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1868 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1869 			tail->tstamp = skb->tstamp;
1870 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1871 		}
1872 
1873 		/* Not as strict as GRO. We only need to carry the max MSS value. */
1874 		shinfo->gso_size = max(gso_size, tail_gso_size);
1875 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1876 
1877 		sk->sk_backlog.len += delta;
1878 		__NET_INC_STATS(sock_net(sk),
1879 				LINUX_MIB_TCPBACKLOGCOALESCE);
1880 		kfree_skb_partial(skb, fragstolen);
1881 		return false;
1882 	}
1883 	__skb_push(skb, hdrlen);
1884 
1885 no_coalesce:
1886 	/* Only the socket owner can try to collapse/prune rx queues
1887 	 * to reduce memory overhead, so add a little headroom here.
1888 	 * Only a few socket backlogs are likely to be non-empty concurrently.
1889 	 */
1890 	limit += 64*1024;
1891 
1892 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
1893 		bh_unlock_sock(sk);
1894 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1895 		return true;
1896 	}
1897 	return false;
1898 }
1899 EXPORT_SYMBOL(tcp_add_backlog);
1900 
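/* Run the socket filter attached to @sk on @skb; the filter may trim the
 * skb, but never below the TCP header length (th->doff * 4).
 */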
1901 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1902 {
1903 	struct tcphdr *th = (struct tcphdr *)skb->data;
1904 
1905 	return sk_filter_trim_cap(sk, skb, th->doff * 4);
1906 }
1907 EXPORT_SYMBOL(tcp_filter);
1908 
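/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual
 * location, so the skb can be looked up or handed to another socket again.
 */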
1909 static void tcp_v4_restore_cb(struct sk_buff *skb)
1910 {
1911 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1912 		sizeof(struct inet_skb_parm));
1913 }
1914 
1915 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1916 			   const struct tcphdr *th)
1917 {
1918 	/* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1919 	 * barrier() makes sure the compiler won't play aliasing games.
1920 	 */
1921 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1922 		sizeof(struct inet_skb_parm));
1923 	barrier();
1924 
1925 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1926 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1927 				    skb->len - th->doff * 4);
1928 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1929 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1930 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1931 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1932 	TCP_SKB_CB(skb)->sacked	 = 0;
1933 	TCP_SKB_CB(skb)->has_rxtstamp =
1934 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1935 }
1936 
1937 /*
1938  *	From tcp_input.c
1939  */
1940 
1941 int tcp_v4_rcv(struct sk_buff *skb)
1942 {
1943 	struct net *net = dev_net(skb->dev);
1944 	struct sk_buff *skb_to_free;
1945 	int sdif = inet_sdif(skb);
1946 	int dif = inet_iif(skb);
1947 	const struct iphdr *iph;
1948 	const struct tcphdr *th;
1949 	bool refcounted;
1950 	struct sock *sk;
1951 	int ret;
1952 
1953 	if (skb->pkt_type != PACKET_HOST)
1954 		goto discard_it;
1955 
1956 	/* Count it even if it's bad */
1957 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
1958 
1959 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1960 		goto discard_it;
1961 
1962 	th = (const struct tcphdr *)skb->data;
1963 
1964 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1965 		goto bad_packet;
1966 	if (!pskb_may_pull(skb, th->doff * 4))
1967 		goto discard_it;
1968 
1969 	/* An explanation is required here, I think.
1970 	 * Packet length and doff are validated by header prediction,
1971 	 * provided the case of th->doff == 0 is eliminated.
1972 	 * So, we defer the checks. */
1973 
1974 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1975 		goto csum_error;
1976 
1977 	th = (const struct tcphdr *)skb->data;
1978 	iph = ip_hdr(skb);
1979 lookup:
1980 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1981 			       th->dest, sdif, &refcounted);
1982 	if (!sk)
1983 		goto no_tcp_socket;
1984 
1985 process:
1986 	if (sk->sk_state == TCP_TIME_WAIT)
1987 		goto do_time_wait;
1988 
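	/* The lookup hit a request socket (SYN_RECV minisock): switch to
	 * its listener (migrating it if the listener is being closed),
	 * re-check MD5 and checksum, then let tcp_check_req() decide
	 * whether a full child socket can be created for this packet.
	 */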
1989 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
1990 		struct request_sock *req = inet_reqsk(sk);
1991 		bool req_stolen = false;
1992 		struct sock *nsk;
1993 
1994 		sk = req->rsk_listener;
1995 		if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
1996 			sk_drops_add(sk, skb);
1997 			reqsk_put(req);
1998 			goto discard_it;
1999 		}
2000 		if (tcp_checksum_complete(skb)) {
2001 			reqsk_put(req);
2002 			goto csum_error;
2003 		}
2004 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2005 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2006 			if (!nsk) {
2007 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2008 				goto lookup;
2009 			}
2010 			sk = nsk;
2011 			/* reuseport_migrate_sock() has already taken a reference
2012 			 * (sk_refcnt) on the returned socket.
2013 			 */
2014 		} else {
2015 			/* We own a reference on the listener, increase it again
2016 			 * as we might lose it too soon.
2017 			 */
2018 			sock_hold(sk);
2019 		}
2020 		refcounted = true;
2021 		nsk = NULL;
2022 		if (!tcp_filter(sk, skb)) {
2023 			th = (const struct tcphdr *)skb->data;
2024 			iph = ip_hdr(skb);
2025 			tcp_v4_fill_cb(skb, iph, th);
2026 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2027 		}
2028 		if (!nsk) {
2029 			reqsk_put(req);
2030 			if (req_stolen) {
2031 				/* Another cpu got exclusive access to req
2032 				 * and created a full-blown socket.
2033 				 * Try to feed this packet to this socket
2034 				 * instead of discarding it.
2035 				 */
2036 				tcp_v4_restore_cb(skb);
2037 				sock_put(sk);
2038 				goto lookup;
2039 			}
2040 			goto discard_and_relse;
2041 		}
2042 		if (nsk == sk) {
2043 			reqsk_put(req);
2044 			tcp_v4_restore_cb(skb);
2045 		} else if (tcp_child_process(sk, nsk, skb)) {
2046 			tcp_v4_send_reset(nsk, skb);
2047 			goto discard_and_relse;
2048 		} else {
2049 			sock_put(sk);
2050 			return 0;
2051 		}
2052 	}
2053 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2054 		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2055 		goto discard_and_relse;
2056 	}
2057 
2058 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2059 		goto discard_and_relse;
2060 
2061 	if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2062 		goto discard_and_relse;
2063 
2064 	nf_reset_ct(skb);
2065 
2066 	if (tcp_filter(sk, skb))
2067 		goto discard_and_relse;
2068 	th = (const struct tcphdr *)skb->data;
2069 	iph = ip_hdr(skb);
2070 	tcp_v4_fill_cb(skb, iph, th);
2071 
2072 	skb->dev = NULL;
2073 
2074 	if (sk->sk_state == TCP_LISTEN) {
2075 		ret = tcp_v4_do_rcv(sk, skb);
2076 		goto put_and_return;
2077 	}
2078 
2079 	sk_incoming_cpu_update(sk);
2080 
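	/* If a process owns the socket, the skb is queued to the backlog
	 * and processed when the owner releases the socket; otherwise it
	 * is processed right away by tcp_v4_do_rcv().
	 */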
2081 	bh_lock_sock_nested(sk);
2082 	tcp_segs_in(tcp_sk(sk), skb);
2083 	ret = 0;
2084 	if (!sock_owned_by_user(sk)) {
2085 		skb_to_free = sk->sk_rx_skb_cache;
2086 		sk->sk_rx_skb_cache = NULL;
2087 		ret = tcp_v4_do_rcv(sk, skb);
2088 	} else {
2089 		if (tcp_add_backlog(sk, skb))
2090 			goto discard_and_relse;
2091 		skb_to_free = NULL;
2092 	}
2093 	bh_unlock_sock(sk);
2094 	if (skb_to_free)
2095 		__kfree_skb(skb_to_free);
2096 
2097 put_and_return:
2098 	if (refcounted)
2099 		sock_put(sk);
2100 
2101 	return ret;
2102 
2103 no_tcp_socket:
2104 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2105 		goto discard_it;
2106 
2107 	tcp_v4_fill_cb(skb, iph, th);
2108 
2109 	if (tcp_checksum_complete(skb)) {
2110 csum_error:
2111 		trace_tcp_bad_csum(skb);
2112 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2113 bad_packet:
2114 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2115 	} else {
2116 		tcp_v4_send_reset(NULL, skb);
2117 	}
2118 
2119 discard_it:
2120 	/* Discard frame. */
2121 	kfree_skb(skb);
2122 	return 0;
2123 
2124 discard_and_relse:
2125 	sk_drops_add(sk, skb);
2126 	if (refcounted)
2127 		sock_put(sk);
2128 	goto discard_it;
2129 
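	/* TIME_WAIT handling: tcp_timewait_state_process() tells us whether
	 * to treat this as a new connection attempt (TCP_TW_SYN), ACK it,
	 * reset the peer, or silently drop the segment.
	 */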
2130 do_time_wait:
2131 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2132 		inet_twsk_put(inet_twsk(sk));
2133 		goto discard_it;
2134 	}
2135 
2136 	tcp_v4_fill_cb(skb, iph, th);
2137 
2138 	if (tcp_checksum_complete(skb)) {
2139 		inet_twsk_put(inet_twsk(sk));
2140 		goto csum_error;
2141 	}
2142 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2143 	case TCP_TW_SYN: {
2144 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2145 							&tcp_hashinfo, skb,
2146 							__tcp_hdrlen(th),
2147 							iph->saddr, th->source,
2148 							iph->daddr, th->dest,
2149 							inet_iif(skb),
2150 							sdif);
2151 		if (sk2) {
2152 			inet_twsk_deschedule_put(inet_twsk(sk));
2153 			sk = sk2;
2154 			tcp_v4_restore_cb(skb);
2155 			refcounted = false;
2156 			goto process;
2157 		}
2158 	}
2159 		/* to ACK */
2160 		fallthrough;
2161 	case TCP_TW_ACK:
2162 		tcp_v4_timewait_ack(sk, skb);
2163 		break;
2164 	case TCP_TW_RST:
2165 		tcp_v4_send_reset(sk, skb);
2166 		inet_twsk_deschedule_put(inet_twsk(sk));
2167 		goto discard_it;
2168 	case TCP_TW_SUCCESS:;
2169 	}
2170 	goto discard_it;
2171 }
2172 
2173 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2174 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2175 	.twsk_unique	= tcp_twsk_unique,
2176 	.twsk_destructor = tcp_twsk_destructor,
2177 };
2178 
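/* Cache the input route of @skb on the socket, tagged with the incoming
 * interface, for use by the receive fast path and early demux.
 */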
2179 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2180 {
2181 	struct dst_entry *dst = skb_dst(skb);
2182 
2183 	if (dst && dst_hold_safe(dst)) {
2184 		sk->sk_rx_dst = dst;
2185 		inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2186 	}
2187 }
2188 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2189 
2190 const struct inet_connection_sock_af_ops ipv4_specific = {
2191 	.queue_xmit	   = ip_queue_xmit,
2192 	.send_check	   = tcp_v4_send_check,
2193 	.rebuild_header	   = inet_sk_rebuild_header,
2194 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2195 	.conn_request	   = tcp_v4_conn_request,
2196 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2197 	.net_header_len	   = sizeof(struct iphdr),
2198 	.setsockopt	   = ip_setsockopt,
2199 	.getsockopt	   = ip_getsockopt,
2200 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
2201 	.sockaddr_len	   = sizeof(struct sockaddr_in),
2202 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2203 };
2204 EXPORT_SYMBOL(ipv4_specific);
2205 
2206 #ifdef CONFIG_TCP_MD5SIG
2207 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2208 	.md5_lookup		= tcp_v4_md5_lookup,
2209 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2210 	.md5_parse		= tcp_v4_parse_md5_keys,
2211 };
2212 #endif
2213 
2214 /* NOTE: A lot of fields are set to zero explicitly by the call to
2215  *       sk_alloc(), so they need not be initialized here.
2216  */
2217 static int tcp_v4_init_sock(struct sock *sk)
2218 {
2219 	struct inet_connection_sock *icsk = inet_csk(sk);
2220 
2221 	tcp_init_sock(sk);
2222 
2223 	icsk->icsk_af_ops = &ipv4_specific;
2224 
2225 #ifdef CONFIG_TCP_MD5SIG
2226 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2227 #endif
2228 
2229 	return 0;
2230 }
2231 
2232 void tcp_v4_destroy_sock(struct sock *sk)
2233 {
2234 	struct tcp_sock *tp = tcp_sk(sk);
2235 
2236 	trace_tcp_destroy_sock(sk);
2237 
2238 	tcp_clear_xmit_timers(sk);
2239 
2240 	tcp_cleanup_congestion_control(sk);
2241 
2242 	tcp_cleanup_ulp(sk);
2243 
2244 	/* Clean up the write buffer. */
2245 	tcp_write_queue_purge(sk);
2246 
2247 	/* Check if we want to disable active TFO */
2248 	tcp_fastopen_active_disable_ofo_check(sk);
2249 
2250 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2251 	skb_rbtree_purge(&tp->out_of_order_queue);
2252 
2253 #ifdef CONFIG_TCP_MD5SIG
2254 	/* Clean up the MD5 key list, if any */
2255 	if (tp->md5sig_info) {
2256 		tcp_clear_md5_list(sk);
2257 		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2258 		tp->md5sig_info = NULL;
2259 	}
2260 #endif
2261 
2262 	/* Clean up a referenced TCP bind bucket. */
2263 	if (inet_csk(sk)->icsk_bind_hash)
2264 		inet_put_port(sk);
2265 
2266 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2267 
2268 	/* If socket is aborted during connect operation */
2269 	tcp_free_fastopen_req(tp);
2270 	tcp_fastopen_destroy_cipher(sk);
2271 	tcp_saved_syn_free(tp);
2272 
2273 	sk_sockets_allocated_dec(sk);
2274 }
2275 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2276 
2277 #ifdef CONFIG_PROC_FS
2278 /* Proc filesystem TCP sock list dumping. */
2279 
2280 static unsigned short seq_file_family(const struct seq_file *seq);
2281 
2282 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2283 {
2284 	unsigned short family = seq_file_family(seq);
2285 
2286 	/* AF_UNSPEC is used as a match-all. */
2287 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2288 		net_eq(sock_net(sk), seq_file_net(seq)));
2289 }
2290 
2291 /* Find a non-empty bucket (starting from st->bucket)
2292  * and return the first sk from it.
2293  */
2294 static void *listening_get_first(struct seq_file *seq)
2295 {
2296 	struct tcp_iter_state *st = seq->private;
2297 
2298 	st->offset = 0;
2299 	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2300 		struct inet_listen_hashbucket *ilb2;
2301 		struct inet_connection_sock *icsk;
2302 		struct sock *sk;
2303 
2304 		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2305 		if (hlist_empty(&ilb2->head))
2306 			continue;
2307 
2308 		spin_lock(&ilb2->lock);
2309 		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
2310 			sk = (struct sock *)icsk;
2311 			if (seq_sk_match(seq, sk))
2312 				return sk;
2313 		}
2314 		spin_unlock(&ilb2->lock);
2315 	}
2316 
2317 	return NULL;
2318 }
2319 
2320 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2321  * If "cur" is the last one in st->bucket,
2322  * call listening_get_first() to return the first sk of the next
2323  * non-empty bucket.
2324  */
2325 static void *listening_get_next(struct seq_file *seq, void *cur)
2326 {
2327 	struct tcp_iter_state *st = seq->private;
2328 	struct inet_listen_hashbucket *ilb2;
2329 	struct inet_connection_sock *icsk;
2330 	struct sock *sk = cur;
2331 
2332 	++st->num;
2333 	++st->offset;
2334 
2335 	icsk = inet_csk(sk);
2336 	inet_lhash2_for_each_icsk_continue(icsk) {
2337 		sk = (struct sock *)icsk;
2338 		if (seq_sk_match(seq, sk))
2339 			return sk;
2340 	}
2341 
2342 	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2343 	spin_unlock(&ilb2->lock);
2344 	++st->bucket;
2345 	return listening_get_first(seq);
2346 }
2347 
2348 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2349 {
2350 	struct tcp_iter_state *st = seq->private;
2351 	void *rc;
2352 
2353 	st->bucket = 0;
2354 	st->offset = 0;
2355 	rc = listening_get_first(seq);
2356 
2357 	while (rc && *pos) {
2358 		rc = listening_get_next(seq, rc);
2359 		--*pos;
2360 	}
2361 	return rc;
2362 }
2363 
2364 static inline bool empty_bucket(const struct tcp_iter_state *st)
2365 {
2366 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2367 }
2368 
2369 /*
2370  * Get first established socket starting from bucket given in st->bucket.
2371  * If st->bucket is zero, the very first socket in the hash is returned.
2372  */
2373 static void *established_get_first(struct seq_file *seq)
2374 {
2375 	struct tcp_iter_state *st = seq->private;
2376 
2377 	st->offset = 0;
2378 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2379 		struct sock *sk;
2380 		struct hlist_nulls_node *node;
2381 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2382 
2383 		/* Lockless fast path for the common case of empty buckets */
2384 		if (empty_bucket(st))
2385 			continue;
2386 
2387 		spin_lock_bh(lock);
2388 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2389 			if (seq_sk_match(seq, sk))
2390 				return sk;
2391 		}
2392 		spin_unlock_bh(lock);
2393 	}
2394 
2395 	return NULL;
2396 }
2397 
2398 static void *established_get_next(struct seq_file *seq, void *cur)
2399 {
2400 	struct sock *sk = cur;
2401 	struct hlist_nulls_node *node;
2402 	struct tcp_iter_state *st = seq->private;
2403 
2404 	++st->num;
2405 	++st->offset;
2406 
2407 	sk = sk_nulls_next(sk);
2408 
2409 	sk_nulls_for_each_from(sk, node) {
2410 		if (seq_sk_match(seq, sk))
2411 			return sk;
2412 	}
2413 
2414 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2415 	++st->bucket;
2416 	return established_get_first(seq);
2417 }
2418 
2419 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2420 {
2421 	struct tcp_iter_state *st = seq->private;
2422 	void *rc;
2423 
2424 	st->bucket = 0;
2425 	rc = established_get_first(seq);
2426 
2427 	while (rc && pos) {
2428 		rc = established_get_next(seq, rc);
2429 		--pos;
2430 	}
2431 	return rc;
2432 }
2433 
2434 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2435 {
2436 	void *rc;
2437 	struct tcp_iter_state *st = seq->private;
2438 
2439 	st->state = TCP_SEQ_STATE_LISTENING;
2440 	rc	  = listening_get_idx(seq, &pos);
2441 
2442 	if (!rc) {
2443 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2444 		rc	  = established_get_idx(seq, pos);
2445 	}
2446 
2447 	return rc;
2448 }
2449 
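/* Resume iteration at the bucket/offset recorded by the previous pass, so
 * a new read() does not have to walk all preceding sockets again.
 */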
2450 static void *tcp_seek_last_pos(struct seq_file *seq)
2451 {
2452 	struct tcp_iter_state *st = seq->private;
2453 	int bucket = st->bucket;
2454 	int offset = st->offset;
2455 	int orig_num = st->num;
2456 	void *rc = NULL;
2457 
2458 	switch (st->state) {
2459 	case TCP_SEQ_STATE_LISTENING:
2460 		if (st->bucket > tcp_hashinfo.lhash2_mask)
2461 			break;
2462 		st->state = TCP_SEQ_STATE_LISTENING;
2463 		rc = listening_get_first(seq);
2464 		while (offset-- && rc && bucket == st->bucket)
2465 			rc = listening_get_next(seq, rc);
2466 		if (rc)
2467 			break;
2468 		st->bucket = 0;
2469 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2470 		fallthrough;
2471 	case TCP_SEQ_STATE_ESTABLISHED:
2472 		if (st->bucket > tcp_hashinfo.ehash_mask)
2473 			break;
2474 		rc = established_get_first(seq);
2475 		while (offset-- && rc && bucket == st->bucket)
2476 			rc = established_get_next(seq, rc);
2477 	}
2478 
2479 	st->num = orig_num;
2480 
2481 	return rc;
2482 }
2483 
2484 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2485 {
2486 	struct tcp_iter_state *st = seq->private;
2487 	void *rc;
2488 
2489 	if (*pos && *pos == st->last_pos) {
2490 		rc = tcp_seek_last_pos(seq);
2491 		if (rc)
2492 			goto out;
2493 	}
2494 
2495 	st->state = TCP_SEQ_STATE_LISTENING;
2496 	st->num = 0;
2497 	st->bucket = 0;
2498 	st->offset = 0;
2499 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2500 
2501 out:
2502 	st->last_pos = *pos;
2503 	return rc;
2504 }
2505 EXPORT_SYMBOL(tcp_seq_start);
2506 
2507 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2508 {
2509 	struct tcp_iter_state *st = seq->private;
2510 	void *rc = NULL;
2511 
2512 	if (v == SEQ_START_TOKEN) {
2513 		rc = tcp_get_idx(seq, 0);
2514 		goto out;
2515 	}
2516 
2517 	switch (st->state) {
2518 	case TCP_SEQ_STATE_LISTENING:
2519 		rc = listening_get_next(seq, v);
2520 		if (!rc) {
2521 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2522 			st->bucket = 0;
2523 			st->offset = 0;
2524 			rc	  = established_get_first(seq);
2525 		}
2526 		break;
2527 	case TCP_SEQ_STATE_ESTABLISHED:
2528 		rc = established_get_next(seq, v);
2529 		break;
2530 	}
2531 out:
2532 	++*pos;
2533 	st->last_pos = *pos;
2534 	return rc;
2535 }
2536 EXPORT_SYMBOL(tcp_seq_next);
2537 
2538 void tcp_seq_stop(struct seq_file *seq, void *v)
2539 {
2540 	struct tcp_iter_state *st = seq->private;
2541 
2542 	switch (st->state) {
2543 	case TCP_SEQ_STATE_LISTENING:
2544 		if (v != SEQ_START_TOKEN)
2545 			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2546 		break;
2547 	case TCP_SEQ_STATE_ESTABLISHED:
2548 		if (v)
2549 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2550 		break;
2551 	}
2552 }
2553 EXPORT_SYMBOL(tcp_seq_stop);
2554 
2555 static void get_openreq4(const struct request_sock *req,
2556 			 struct seq_file *f, int i)
2557 {
2558 	const struct inet_request_sock *ireq = inet_rsk(req);
2559 	long delta = req->rsk_timer.expires - jiffies;
2560 
2561 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2562 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2563 		i,
2564 		ireq->ir_loc_addr,
2565 		ireq->ir_num,
2566 		ireq->ir_rmt_addr,
2567 		ntohs(ireq->ir_rmt_port),
2568 		TCP_SYN_RECV,
2569 		0, 0, /* could print option size, but that is af dependent. */
2570 		1,    /* timers active (only the expire timer) */
2571 		jiffies_delta_to_clock_t(delta),
2572 		req->num_timeout,
2573 		from_kuid_munged(seq_user_ns(f),
2574 				 sock_i_uid(req->rsk_listener)),
2575 		0,  /* non standard timer */
2576 		0, /* open_requests have no inode */
2577 		0,
2578 		req);
2579 }
2580 
2581 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2582 {
2583 	int timer_active;
2584 	unsigned long timer_expires;
2585 	const struct tcp_sock *tp = tcp_sk(sk);
2586 	const struct inet_connection_sock *icsk = inet_csk(sk);
2587 	const struct inet_sock *inet = inet_sk(sk);
2588 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2589 	__be32 dest = inet->inet_daddr;
2590 	__be32 src = inet->inet_rcv_saddr;
2591 	__u16 destp = ntohs(inet->inet_dport);
2592 	__u16 srcp = ntohs(inet->inet_sport);
2593 	int rx_queue;
2594 	int state;
2595 
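	/* Timer codes reported in the "tr" column of /proc/net/tcp:
	 * 0 = none, 1 = retransmit/loss probe, 2 = keepalive,
	 * 4 = zero window probe; TIME_WAIT sockets report 3
	 * (see get_timewait4_sock()).
	 */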
2596 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2597 	    icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2598 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2599 		timer_active	= 1;
2600 		timer_expires	= icsk->icsk_timeout;
2601 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2602 		timer_active	= 4;
2603 		timer_expires	= icsk->icsk_timeout;
2604 	} else if (timer_pending(&sk->sk_timer)) {
2605 		timer_active	= 2;
2606 		timer_expires	= sk->sk_timer.expires;
2607 	} else {
2608 		timer_active	= 0;
2609 		timer_expires = jiffies;
2610 	}
2611 
2612 	state = inet_sk_state_load(sk);
2613 	if (state == TCP_LISTEN)
2614 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2615 	else
2616 		/* Because we don't lock the socket,
2617 		 * we might find a transient negative value.
2618 		 */
2619 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2620 				      READ_ONCE(tp->copied_seq), 0);
2621 
2622 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2623 			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2624 		i, src, srcp, dest, destp, state,
2625 		READ_ONCE(tp->write_seq) - tp->snd_una,
2626 		rx_queue,
2627 		timer_active,
2628 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2629 		icsk->icsk_retransmits,
2630 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2631 		icsk->icsk_probes_out,
2632 		sock_i_ino(sk),
2633 		refcount_read(&sk->sk_refcnt), sk,
2634 		jiffies_to_clock_t(icsk->icsk_rto),
2635 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2636 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2637 		tp->snd_cwnd,
2638 		state == TCP_LISTEN ?
2639 		    fastopenq->max_qlen :
2640 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2641 }
2642 
2643 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2644 			       struct seq_file *f, int i)
2645 {
2646 	long delta = tw->tw_timer.expires - jiffies;
2647 	__be32 dest, src;
2648 	__u16 destp, srcp;
2649 
2650 	dest  = tw->tw_daddr;
2651 	src   = tw->tw_rcv_saddr;
2652 	destp = ntohs(tw->tw_dport);
2653 	srcp  = ntohs(tw->tw_sport);
2654 
2655 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2656 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2657 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2658 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2659 		refcount_read(&tw->tw_refcnt), tw);
2660 }
2661 
2662 #define TMPSZ 150
2663 
2664 static int tcp4_seq_show(struct seq_file *seq, void *v)
2665 {
2666 	struct tcp_iter_state *st;
2667 	struct sock *sk = v;
2668 
2669 	seq_setwidth(seq, TMPSZ - 1);
2670 	if (v == SEQ_START_TOKEN) {
2671 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2672 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2673 			   "inode");
2674 		goto out;
2675 	}
2676 	st = seq->private;
2677 
2678 	if (sk->sk_state == TCP_TIME_WAIT)
2679 		get_timewait4_sock(v, seq, st->num);
2680 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2681 		get_openreq4(v, seq, st->num);
2682 	else
2683 		get_tcp4_sock(v, seq, st->num);
2684 out:
2685 	seq_pad(seq, '\n');
2686 	return 0;
2687 }
2688 
2689 #ifdef CONFIG_BPF_SYSCALL
2690 struct bpf_tcp_iter_state {
2691 	struct tcp_iter_state state;
2692 	unsigned int cur_sk;
2693 	unsigned int end_sk;
2694 	unsigned int max_sk;
2695 	struct sock **batch;
2696 	bool st_bucket_done;
2697 };
2698 
2699 struct bpf_iter__tcp {
2700 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2701 	__bpf_md_ptr(struct sock_common *, sk_common);
2702 	uid_t uid __aligned(8);
2703 };
2704 
2705 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2706 			     struct sock_common *sk_common, uid_t uid)
2707 {
2708 	struct bpf_iter__tcp ctx;
2709 
2710 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2711 	ctx.meta = meta;
2712 	ctx.sk_common = sk_common;
2713 	ctx.uid = uid;
2714 	return bpf_iter_run_prog(prog, &ctx);
2715 }
2716 
2717 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2718 {
2719 	while (iter->cur_sk < iter->end_sk)
2720 		sock_put(iter->batch[iter->cur_sk++]);
2721 }
2722 
2723 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2724 				      unsigned int new_batch_sz)
2725 {
2726 	struct sock **new_batch;
2727 
2728 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2729 			     GFP_USER | __GFP_NOWARN);
2730 	if (!new_batch)
2731 		return -ENOMEM;
2732 
2733 	bpf_iter_tcp_put_batch(iter);
2734 	kvfree(iter->batch);
2735 	iter->batch = new_batch;
2736 	iter->max_sk = new_batch_sz;
2737 
2738 	return 0;
2739 }
2740 
2741 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2742 						 struct sock *start_sk)
2743 {
2744 	struct bpf_tcp_iter_state *iter = seq->private;
2745 	struct tcp_iter_state *st = &iter->state;
2746 	struct inet_connection_sock *icsk;
2747 	unsigned int expected = 1;
2748 	struct sock *sk;
2749 
2750 	sock_hold(start_sk);
2751 	iter->batch[iter->end_sk++] = start_sk;
2752 
2753 	icsk = inet_csk(start_sk);
2754 	inet_lhash2_for_each_icsk_continue(icsk) {
2755 		sk = (struct sock *)icsk;
2756 		if (seq_sk_match(seq, sk)) {
2757 			if (iter->end_sk < iter->max_sk) {
2758 				sock_hold(sk);
2759 				iter->batch[iter->end_sk++] = sk;
2760 			}
2761 			expected++;
2762 		}
2763 	}
2764 	spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2765 
2766 	return expected;
2767 }
2768 
2769 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2770 						   struct sock *start_sk)
2771 {
2772 	struct bpf_tcp_iter_state *iter = seq->private;
2773 	struct tcp_iter_state *st = &iter->state;
2774 	struct hlist_nulls_node *node;
2775 	unsigned int expected = 1;
2776 	struct sock *sk;
2777 
2778 	sock_hold(start_sk);
2779 	iter->batch[iter->end_sk++] = start_sk;
2780 
2781 	sk = sk_nulls_next(start_sk);
2782 	sk_nulls_for_each_from(sk, node) {
2783 		if (seq_sk_match(seq, sk)) {
2784 			if (iter->end_sk < iter->max_sk) {
2785 				sock_hold(sk);
2786 				iter->batch[iter->end_sk++] = sk;
2787 			}
2788 			expected++;
2789 		}
2790 	}
2791 	spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2792 
2793 	return expected;
2794 }
2795 
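/* Grab a reference to every matching socket in the current bucket while the
 * bucket lock is held, so the bpf program can later run on each socket
 * (possibly taking the socket lock) without holding the bucket lock.
 */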
2796 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2797 {
2798 	struct bpf_tcp_iter_state *iter = seq->private;
2799 	struct tcp_iter_state *st = &iter->state;
2800 	unsigned int expected;
2801 	bool resized = false;
2802 	struct sock *sk;
2803 
2804 	/* The st->bucket is done.  Directly advance to the next
2805 	 * bucket instead of having tcp_seek_last_pos() skip sockets
2806 	 * one by one in the current bucket, only to find out that it
2807 	 * has to advance to the next bucket anyway.
2808 	 */
2809 	if (iter->st_bucket_done) {
2810 		st->offset = 0;
2811 		st->bucket++;
2812 		if (st->state == TCP_SEQ_STATE_LISTENING &&
2813 		    st->bucket > tcp_hashinfo.lhash2_mask) {
2814 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2815 			st->bucket = 0;
2816 		}
2817 	}
2818 
2819 again:
2820 	/* Get a new batch */
2821 	iter->cur_sk = 0;
2822 	iter->end_sk = 0;
2823 	iter->st_bucket_done = false;
2824 
2825 	sk = tcp_seek_last_pos(seq);
2826 	if (!sk)
2827 		return NULL; /* Done */
2828 
2829 	if (st->state == TCP_SEQ_STATE_LISTENING)
2830 		expected = bpf_iter_tcp_listening_batch(seq, sk);
2831 	else
2832 		expected = bpf_iter_tcp_established_batch(seq, sk);
2833 
2834 	if (iter->end_sk == expected) {
2835 		iter->st_bucket_done = true;
2836 		return sk;
2837 	}
2838 
2839 	if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2840 		resized = true;
2841 		goto again;
2842 	}
2843 
2844 	return sk;
2845 }
2846 
2847 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2848 {
2849 	/* bpf iter does not support lseek, so it always
2850 	 * continues from where it was stop()-ped.
2851 	 */
2852 	if (*pos)
2853 		return bpf_iter_tcp_batch(seq);
2854 
2855 	return SEQ_START_TOKEN;
2856 }
2857 
2858 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2859 {
2860 	struct bpf_tcp_iter_state *iter = seq->private;
2861 	struct tcp_iter_state *st = &iter->state;
2862 	struct sock *sk;
2863 
2864 	/* Whenever seq_next() is called, the sk at iter->cur_sk has
2865 	 * been handled by seq_show(), so advance to the next sk in
2866 	 * the batch.
2867 	 */
2868 	if (iter->cur_sk < iter->end_sk) {
2869 		/* Keeping st->num consistent in tcp_iter_state.
2870 		 * bpf_iter_tcp does not use st->num.
2871 		 * meta.seq_num is used instead.
2872 		 */
2873 		st->num++;
2874 		/* Move st->offset to the next sk in the bucket such that
2875 		 * the future start() will resume at st->offset in
2876 		 * st->bucket.  See tcp_seek_last_pos().
2877 		 */
2878 		st->offset++;
2879 		sock_put(iter->batch[iter->cur_sk++]);
2880 	}
2881 
2882 	if (iter->cur_sk < iter->end_sk)
2883 		sk = iter->batch[iter->cur_sk];
2884 	else
2885 		sk = bpf_iter_tcp_batch(seq);
2886 
2887 	++*pos;
2888 	/* Keeping st->last_pos consistent in tcp_iter_state.
2889 	 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2890 	 */
2891 	st->last_pos = *pos;
2892 	return sk;
2893 }
2894 
2895 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2896 {
2897 	struct bpf_iter_meta meta;
2898 	struct bpf_prog *prog;
2899 	struct sock *sk = v;
2900 	bool slow;
2901 	uid_t uid;
2902 	int ret;
2903 
2904 	if (v == SEQ_START_TOKEN)
2905 		return 0;
2906 
2907 	if (sk_fullsock(sk))
2908 		slow = lock_sock_fast(sk);
2909 
2910 	if (unlikely(sk_unhashed(sk))) {
2911 		ret = SEQ_SKIP;
2912 		goto unlock;
2913 	}
2914 
2915 	if (sk->sk_state == TCP_TIME_WAIT) {
2916 		uid = 0;
2917 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2918 		const struct request_sock *req = v;
2919 
2920 		uid = from_kuid_munged(seq_user_ns(seq),
2921 				       sock_i_uid(req->rsk_listener));
2922 	} else {
2923 		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2924 	}
2925 
2926 	meta.seq = seq;
2927 	prog = bpf_iter_get_info(&meta, false);
2928 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
2929 
2930 unlock:
2931 	if (sk_fullsock(sk))
2932 		unlock_sock_fast(sk, slow);
2933 	return ret;
2934 
2935 }
2936 
2937 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2938 {
2939 	struct bpf_tcp_iter_state *iter = seq->private;
2940 	struct bpf_iter_meta meta;
2941 	struct bpf_prog *prog;
2942 
2943 	if (!v) {
2944 		meta.seq = seq;
2945 		prog = bpf_iter_get_info(&meta, true);
2946 		if (prog)
2947 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
2948 	}
2949 
2950 	if (iter->cur_sk < iter->end_sk) {
2951 		bpf_iter_tcp_put_batch(iter);
2952 		iter->st_bucket_done = false;
2953 	}
2954 }
2955 
2956 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2957 	.show		= bpf_iter_tcp_seq_show,
2958 	.start		= bpf_iter_tcp_seq_start,
2959 	.next		= bpf_iter_tcp_seq_next,
2960 	.stop		= bpf_iter_tcp_seq_stop,
2961 };
2962 #endif
2963 static unsigned short seq_file_family(const struct seq_file *seq)
2964 {
2965 	const struct tcp_seq_afinfo *afinfo;
2966 
2967 #ifdef CONFIG_BPF_SYSCALL
2968 	/* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2969 	if (seq->op == &bpf_iter_tcp_seq_ops)
2970 		return AF_UNSPEC;
2971 #endif
2972 
2973 	/* Iterated from proc fs */
2974 	afinfo = PDE_DATA(file_inode(seq->file));
2975 	return afinfo->family;
2976 }
2977 
2978 static const struct seq_operations tcp4_seq_ops = {
2979 	.show		= tcp4_seq_show,
2980 	.start		= tcp_seq_start,
2981 	.next		= tcp_seq_next,
2982 	.stop		= tcp_seq_stop,
2983 };
2984 
2985 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2986 	.family		= AF_INET,
2987 };
2988 
2989 static int __net_init tcp4_proc_init_net(struct net *net)
2990 {
2991 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2992 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2993 		return -ENOMEM;
2994 	return 0;
2995 }
2996 
2997 static void __net_exit tcp4_proc_exit_net(struct net *net)
2998 {
2999 	remove_proc_entry("tcp", net->proc_net);
3000 }
3001 
3002 static struct pernet_operations tcp4_net_ops = {
3003 	.init = tcp4_proc_init_net,
3004 	.exit = tcp4_proc_exit_net,
3005 };
3006 
3007 int __init tcp4_proc_init(void)
3008 {
3009 	return register_pernet_subsys(&tcp4_net_ops);
3010 }
3011 
3012 void tcp4_proc_exit(void)
3013 {
3014 	unregister_pernet_subsys(&tcp4_net_ops);
3015 }
3016 #endif /* CONFIG_PROC_FS */
3017 
3018 /* @wake is one when sk_stream_write_space() calls us.
3019  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3020  * This mimics the strategy used in sock_def_write_space().
3021  */
3022 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3023 {
3024 	const struct tcp_sock *tp = tcp_sk(sk);
3025 	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3026 			    READ_ONCE(tp->snd_nxt);
3027 
3028 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3029 }
3030 EXPORT_SYMBOL(tcp_stream_memory_free);
3031 
3032 struct proto tcp_prot = {
3033 	.name			= "TCP",
3034 	.owner			= THIS_MODULE,
3035 	.close			= tcp_close,
3036 	.pre_connect		= tcp_v4_pre_connect,
3037 	.connect		= tcp_v4_connect,
3038 	.disconnect		= tcp_disconnect,
3039 	.accept			= inet_csk_accept,
3040 	.ioctl			= tcp_ioctl,
3041 	.init			= tcp_v4_init_sock,
3042 	.destroy		= tcp_v4_destroy_sock,
3043 	.shutdown		= tcp_shutdown,
3044 	.setsockopt		= tcp_setsockopt,
3045 	.getsockopt		= tcp_getsockopt,
3046 	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
3047 	.keepalive		= tcp_set_keepalive,
3048 	.recvmsg		= tcp_recvmsg,
3049 	.sendmsg		= tcp_sendmsg,
3050 	.sendpage		= tcp_sendpage,
3051 	.backlog_rcv		= tcp_v4_do_rcv,
3052 	.release_cb		= tcp_release_cb,
3053 	.hash			= inet_hash,
3054 	.unhash			= inet_unhash,
3055 	.get_port		= inet_csk_get_port,
3056 #ifdef CONFIG_BPF_SYSCALL
3057 	.psock_update_sk_prot	= tcp_bpf_update_proto,
3058 #endif
3059 	.enter_memory_pressure	= tcp_enter_memory_pressure,
3060 	.leave_memory_pressure	= tcp_leave_memory_pressure,
3061 	.stream_memory_free	= tcp_stream_memory_free,
3062 	.sockets_allocated	= &tcp_sockets_allocated,
3063 	.orphan_count		= &tcp_orphan_count,
3064 	.memory_allocated	= &tcp_memory_allocated,
3065 	.memory_pressure	= &tcp_memory_pressure,
3066 	.sysctl_mem		= sysctl_tcp_mem,
3067 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
3068 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
3069 	.max_header		= MAX_TCP_HEADER,
3070 	.obj_size		= sizeof(struct tcp_sock),
3071 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
3072 	.twsk_prot		= &tcp_timewait_sock_ops,
3073 	.rsk_prot		= &tcp_request_sock_ops,
3074 	.h.hashinfo		= &tcp_hashinfo,
3075 	.no_autobind		= true,
3076 	.diag_destroy		= tcp_abort,
3077 };
3078 EXPORT_SYMBOL(tcp_prot);
3079 
3080 static void __net_exit tcp_sk_exit(struct net *net)
3081 {
3082 	int cpu;
3083 
3084 	if (net->ipv4.tcp_congestion_control)
3085 		bpf_module_put(net->ipv4.tcp_congestion_control,
3086 			       net->ipv4.tcp_congestion_control->owner);
3087 
3088 	for_each_possible_cpu(cpu)
3089 		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
3090 	free_percpu(net->ipv4.tcp_sk);
3091 }
3092 
3093 static int __net_init tcp_sk_init(struct net *net)
3094 {
3095 	int res, cpu, cnt;
3096 
3097 	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
3098 	if (!net->ipv4.tcp_sk)
3099 		return -ENOMEM;
3100 
3101 	for_each_possible_cpu(cpu) {
3102 		struct sock *sk;
3103 
3104 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3105 					   IPPROTO_TCP, net);
3106 		if (res)
3107 			goto fail;
3108 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3109 
3110 		/* Enforce IP_DF and IPID==0 for RSTs and
3111 		 * ACKs sent in the SYN-RECV and TIME-WAIT states.
3112 		 */
3113 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3114 
3115 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
3116 	}
3117 
3118 	net->ipv4.sysctl_tcp_ecn = 2;
3119 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3120 
3121 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3122 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3123 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3124 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3125 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3126 
3127 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3128 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3129 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3130 
3131 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3132 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3133 	net->ipv4.sysctl_tcp_syncookies = 1;
3134 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3135 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3136 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3137 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3138 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3139 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3140 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3141 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3142 
3143 	cnt = tcp_hashinfo.ehash_mask + 1;
3144 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3145 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
3146 
3147 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3148 	net->ipv4.sysctl_tcp_sack = 1;
3149 	net->ipv4.sysctl_tcp_window_scaling = 1;
3150 	net->ipv4.sysctl_tcp_timestamps = 1;
3151 	net->ipv4.sysctl_tcp_early_retrans = 3;
3152 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3153 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3154 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3155 	net->ipv4.sysctl_tcp_max_reordering = 300;
3156 	net->ipv4.sysctl_tcp_dsack = 1;
3157 	net->ipv4.sysctl_tcp_app_win = 31;
3158 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3159 	net->ipv4.sysctl_tcp_frto = 2;
3160 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3161 	/* This limits the percentage of the congestion window which we
3162 	 * will allow a single TSO frame to consume.  Building TSO frames
3163 	 * which are too large can cause TCP streams to be bursty.
3164 	 */
3165 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3166 	/* Default TSQ limit of 16 TSO segments */
3167 	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3168 	/* RFC 5961 challenge ACK rate limiting */
3169 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3170 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3171 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3172 	net->ipv4.sysctl_tcp_autocorking = 1;
3173 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3174 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3175 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3176 	if (net != &init_net) {
3177 		memcpy(net->ipv4.sysctl_tcp_rmem,
3178 		       init_net.ipv4.sysctl_tcp_rmem,
3179 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3180 		memcpy(net->ipv4.sysctl_tcp_wmem,
3181 		       init_net.ipv4.sysctl_tcp_wmem,
3182 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3183 	}
3184 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3185 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3186 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3187 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3188 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3189 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3190 
3191 	/* Reno is always built in */
3192 	if (!net_eq(net, &init_net) &&
3193 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3194 			       init_net.ipv4.tcp_congestion_control->owner))
3195 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3196 	else
3197 		net->ipv4.tcp_congestion_control = &tcp_reno;
3198 
3199 	return 0;
3200 fail:
3201 	tcp_sk_exit(net);
3202 
3203 	return res;
3204 }
3205 
3206 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3207 {
3208 	struct net *net;
3209 
3210 	inet_twsk_purge(&tcp_hashinfo, AF_INET);
3211 
3212 	list_for_each_entry(net, net_exit_list, exit_list)
3213 		tcp_fastopen_ctx_destroy(net);
3214 }
3215 
3216 static struct pernet_operations __net_initdata tcp_sk_ops = {
3217        .init	   = tcp_sk_init,
3218        .exit	   = tcp_sk_exit,
3219        .exit_batch = tcp_sk_exit_batch,
3220 };
3221 
3222 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3223 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3224 		     struct sock_common *sk_common, uid_t uid)
3225 
3226 #define INIT_BATCH_SZ 16
3227 
3228 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3229 {
3230 	struct bpf_tcp_iter_state *iter = priv_data;
3231 	int err;
3232 
3233 	err = bpf_iter_init_seq_net(priv_data, aux);
3234 	if (err)
3235 		return err;
3236 
3237 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3238 	if (err) {
3239 		bpf_iter_fini_seq_net(priv_data);
3240 		return err;
3241 	}
3242 
3243 	return 0;
3244 }
3245 
3246 static void bpf_iter_fini_tcp(void *priv_data)
3247 {
3248 	struct bpf_tcp_iter_state *iter = priv_data;
3249 
3250 	bpf_iter_fini_seq_net(priv_data);
3251 	kvfree(iter->batch);
3252 }
3253 
3254 static const struct bpf_iter_seq_info tcp_seq_info = {
3255 	.seq_ops		= &bpf_iter_tcp_seq_ops,
3256 	.init_seq_private	= bpf_iter_init_tcp,
3257 	.fini_seq_private	= bpf_iter_fini_tcp,
3258 	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
3259 };
3260 
3261 static const struct bpf_func_proto *
3262 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3263 			    const struct bpf_prog *prog)
3264 {
3265 	switch (func_id) {
3266 	case BPF_FUNC_setsockopt:
3267 		return &bpf_sk_setsockopt_proto;
3268 	case BPF_FUNC_getsockopt:
3269 		return &bpf_sk_getsockopt_proto;
3270 	default:
3271 		return NULL;
3272 	}
3273 }
3274 
3275 static struct bpf_iter_reg tcp_reg_info = {
3276 	.target			= "tcp",
3277 	.ctx_arg_info_size	= 1,
3278 	.ctx_arg_info		= {
3279 		{ offsetof(struct bpf_iter__tcp, sk_common),
3280 		  PTR_TO_BTF_ID_OR_NULL },
3281 	},
3282 	.get_func_proto		= bpf_iter_tcp_get_func_proto,
3283 	.seq_info		= &tcp_seq_info,
3284 };
3285 
3286 static void __init bpf_iter_register(void)
3287 {
3288 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3289 	if (bpf_iter_reg_target(&tcp_reg_info))
3290 		pr_warn("Warning: could not register bpf iterator tcp\n");
3291 }
3292 
3293 #endif
3294 
3295 void __init tcp_v4_init(void)
3296 {
3297 	if (register_pernet_subsys(&tcp_sk_ops))
3298 		panic("Failed to create the TCP control socket.\n");
3299 
3300 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3301 	bpf_iter_register();
3302 #endif
3303 }
3304