// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */
18
/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89
90 #include <crypto/md5.h>
91 #include <crypto/utils.h>
92
93 #include <trace/events/tcp.h>
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
97 __be32 daddr, __be32 saddr, const struct tcphdr *th);
98 #endif
99
100 struct inet_hashinfo tcp_hashinfo;
101
102 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
103 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
104 };
105
106 static DEFINE_MUTEX(tcp_exit_batch_mutex);
107
108 INDIRECT_CALLABLE_SCOPE union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net * net,const struct sk_buff * skb)109 tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
110 {
111 return secure_tcp_seq_and_ts_off(net,
112 ip_hdr(skb)->daddr,
113 ip_hdr(skb)->saddr,
114 tcp_hdr(skb)->dest,
115 tcp_hdr(skb)->source);
116 }
117
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 struct tcp_sock *tp = tcp_sk(sk);
124 int ts_recent_stamp;
125 u32 reuse_thresh;
126
127 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 reuse = 0;
129
130 if (reuse == 2) {
131 /* Still does not detect *everything* that goes through
132 * lo, since we require a loopback src or dst address
133 * or direct binding to 'lo' interface.
134 */
135 bool loopback = false;
136 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 if (tw->tw_family == AF_INET6) {
140 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 loopback = true;
145 } else
146 #endif
147 {
148 if (ipv4_is_loopback(tw->tw_daddr) ||
149 ipv4_is_loopback(tw->tw_rcv_saddr))
150 loopback = true;
151 }
152 if (!loopback)
153 reuse = 0;
154 }
155
156 /* With PAWS, it is safe from the viewpoint
157 of data integrity. Even without PAWS it is safe provided sequence
158 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159
160 Actually, the idea is close to VJ's one, only timestamp cache is
161 held not per host, but per port pair and TW bucket is used as state
162 holder.
163
164 If TW bucket has been already destroyed we fall back to VJ's scheme
165 and use initial timestamp retrieved from peer table.
166 */
167 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 if (ts_recent_stamp &&
171 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 * and releasing the bucket lock.
174 */
175 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 return 0;
177
178 /* In case of repair and re-using TIME-WAIT sockets we still
179 * want to be sure that it is safe as above but honor the
180 * sequence numbers and time stamps set as part of the repair
181 * process.
182 *
183 * Without this check re-using a TIME-WAIT socket with TCP
184 * repair would accumulate a -1 on the repair assigned
185 * sequence number. The first time it is reused the sequence
186 * is -1, the second time -2, etc. This fixes that issue
187 * without appearing to create any others.
188 */
189 if (likely(!tp->repair)) {
190 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191
192 if (!seq)
193 seq = 1;
194 WRITE_ONCE(tp->write_seq, seq);
195 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
196 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 }
198
199 return 1;
200 }
201
202 return 0;
203 }
204
tcp_v4_pre_connect(struct sock * sk,struct sockaddr_unsized * uaddr,int addr_len)205 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
206 int addr_len)
207 {
208 /* This check is replicated from tcp_v4_connect() and intended to
209 * prevent BPF program called below from accessing bytes that are out
210 * of the bound specified by user in addr_len.
211 */
212 if (addr_len < sizeof(struct sockaddr_in))
213 return -EINVAL;
214
215 sock_owned_by_me(sk);
216
217 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
218 }
219
220 /* This will initiate an outgoing connection. */
tcp_v4_connect(struct sock * sk,struct sockaddr_unsized * uaddr,int addr_len)221 int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
222 {
223 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
224 struct inet_timewait_death_row *tcp_death_row;
225 struct inet_sock *inet = inet_sk(sk);
226 struct tcp_sock *tp = tcp_sk(sk);
227 struct ip_options_rcu *inet_opt;
228 struct net *net = sock_net(sk);
229 __be16 orig_sport, orig_dport;
230 __be32 daddr, nexthop;
231 struct flowi4 *fl4;
232 struct rtable *rt;
233 int err;
234
235 if (addr_len < sizeof(struct sockaddr_in))
236 return -EINVAL;
237
238 if (usin->sin_family != AF_INET)
239 return -EAFNOSUPPORT;
240
241 nexthop = daddr = usin->sin_addr.s_addr;
242 inet_opt = rcu_dereference_protected(inet->inet_opt,
243 lockdep_sock_is_held(sk));
244 if (inet_opt && inet_opt->opt.srr) {
245 if (!daddr)
246 return -EINVAL;
247 nexthop = inet_opt->opt.faddr;
248 }
249
250 orig_sport = inet->inet_sport;
251 orig_dport = usin->sin_port;
252 fl4 = &inet->cork.fl.u.ip4;
253 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
254 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
255 orig_dport, sk);
256 if (IS_ERR(rt)) {
257 err = PTR_ERR(rt);
258 if (err == -ENETUNREACH)
259 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
260 return err;
261 }
262
263 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
264 ip_rt_put(rt);
265 return -ENETUNREACH;
266 }
267
268 if (!inet_opt || !inet_opt->opt.srr)
269 daddr = fl4->daddr;
270
271 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
272
273 if (!inet->inet_saddr) {
274 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
275 if (err) {
276 ip_rt_put(rt);
277 return err;
278 }
279 } else {
280 sk_rcv_saddr_set(sk, inet->inet_saddr);
281 }
282
283 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
284 /* Reset inherited state */
285 tp->rx_opt.ts_recent = 0;
286 tp->rx_opt.ts_recent_stamp = 0;
287 if (likely(!tp->repair))
288 WRITE_ONCE(tp->write_seq, 0);
289 }
290
291 inet->inet_dport = usin->sin_port;
292 sk_daddr_set(sk, daddr);
293
294 inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
295 if (inet_opt)
296 inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
297
298 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
299
300 /* Socket identity is still unknown (sport may be zero).
301 * However we set state to SYN-SENT and not releasing socket
302 * lock select source port, enter ourselves into the hash tables and
303 * complete initialization after this.
304 */
305 tcp_set_state(sk, TCP_SYN_SENT);
306 err = inet_hash_connect(tcp_death_row, sk);
307 if (err)
308 goto failure;
309
310 sk_set_txhash(sk);
311
312 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
313 inet->inet_sport, inet->inet_dport, sk);
314 if (IS_ERR(rt)) {
315 err = PTR_ERR(rt);
316 rt = NULL;
317 goto failure;
318 }
319 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
320 /* OK, now commit destination to socket. */
321 sk->sk_gso_type = SKB_GSO_TCPV4;
322 sk_setup_caps(sk, &rt->dst);
323 rt = NULL;
324
325 if (likely(!tp->repair)) {
326 union tcp_seq_and_ts_off st;
327
328 st = secure_tcp_seq_and_ts_off(net,
329 inet->inet_saddr,
330 inet->inet_daddr,
331 inet->inet_sport,
332 usin->sin_port);
333 if (!tp->write_seq)
334 WRITE_ONCE(tp->write_seq, st.seq);
335 WRITE_ONCE(tp->tsoffset, st.ts_off);
336 }
337
338 atomic_set(&inet->inet_id, get_random_u16());
339
340 if (tcp_fastopen_defer_connect(sk, &err))
341 return err;
342 if (err)
343 goto failure;
344
345 err = tcp_connect(sk);
346
347 if (err)
348 goto failure;
349
350 return 0;
351
352 failure:
353 /*
354 * This unhashes the socket and releases the local port,
355 * if necessary.
356 */
357 tcp_set_state(sk, TCP_CLOSE);
358 inet_bhash2_reset_saddr(sk);
359 ip_rt_put(rt);
360 sk->sk_route_caps = 0;
361 inet->inet_dport = 0;
362 return err;
363 }
364
365 /*
366 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
367 * It can be called through tcp_release_cb() if socket was owned by user
368 * at the time tcp_v4_err() was called to handle ICMP message.
369 */
tcp_v4_mtu_reduced(struct sock * sk)370 void tcp_v4_mtu_reduced(struct sock *sk)
371 {
372 struct inet_sock *inet = inet_sk(sk);
373 struct dst_entry *dst;
374 u32 mtu, dmtu;
375
376 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
377 return;
378 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
379 dst = inet_csk_update_pmtu(sk, mtu);
380 if (!dst)
381 return;
382
383 /* Something is about to be wrong... Remember soft error
384 * for the case, if this connection will not able to recover.
385 */
386 dmtu = dst4_mtu(dst);
387 if (mtu < dmtu && ip_dont_fragment(sk, dst))
388 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
389
390 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
391 ip_sk_accept_pmtu(sk) &&
392 inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
393 tcp_sync_mss(sk, dmtu);
394
395 /* Resend the TCP packet because it's
396 * clear that the old packet has been
397 * dropped. This is the new "fast" path mtu
398 * discovery.
399 */
400 tcp_simple_retransmit(sk);
401 } /* else let the usual retransmit timer handle it */
402 }
403
do_redirect(struct sk_buff * skb,struct sock * sk)404 static void do_redirect(struct sk_buff *skb, struct sock *sk)
405 {
406 struct dst_entry *dst = __sk_dst_check(sk, 0);
407
408 if (dst)
409 dst->ops->redirect(dst, sk, skb);
410 }
411
412
413 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
tcp_req_err(struct sock * sk,u32 seq,bool abort)414 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
415 {
416 struct request_sock *req = inet_reqsk(sk);
417 struct net *net = sock_net(sk);
418
419 /* ICMPs are not backlogged, hence we cannot get
420 * an established socket here.
421 */
422 if (seq != tcp_rsk(req)->snt_isn) {
423 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
424 } else if (abort) {
425 /*
426 * Still in SYN_RECV, just remove it silently.
427 * There is no good way to pass the error to the newly
428 * created socket, and POSIX does not want network
429 * errors returned from accept().
430 */
431 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
432 tcp_listendrop(req->rsk_listener);
433 }
434 reqsk_put(req);
435 }
436
437 /* TCP-LD (RFC 6069) logic */
tcp_ld_RTO_revert(struct sock * sk,u32 seq)438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439 {
440 struct inet_connection_sock *icsk = inet_csk(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 struct sk_buff *skb;
443 s32 remaining;
444 u32 delta_us;
445
446 if (sock_owned_by_user(sk))
447 return;
448
449 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
450 !icsk->icsk_backoff)
451 return;
452
453 skb = tcp_rtx_queue_head(sk);
454 if (WARN_ON_ONCE(!skb))
455 return;
456
457 icsk->icsk_backoff--;
458 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
460
461 tcp_mstamp_refresh(tp);
462 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464
465 if (remaining > 0) {
466 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
467 } else {
468 /* RTO revert clocked out retransmission.
469 * Will retransmit now.
470 */
471 tcp_retransmit_timer(sk);
472 }
473 }
474
475 /*
476 * This routine is called by the ICMP module when it gets some
477 * sort of error condition. If err < 0 then the socket should
478 * be closed and the error returned to the user. If err > 0
479 * it's just the icmp type << 8 | icmp code. After adjustment
480 * header points to the first 8 bytes of the tcp header. We need
481 * to find the appropriate port.
482 *
483 * The locking strategy used here is very "optimistic". When
484 * someone else accesses the socket the ICMP is just dropped
485 * and for some paths there is no check at all.
486 * A more general error queue to queue errors for later handling
487 * is probably better.
488 *
489 */
490
tcp_v4_err(struct sk_buff * skb,u32 info)491 int tcp_v4_err(struct sk_buff *skb, u32 info)
492 {
493 const struct iphdr *iph = (const struct iphdr *)skb->data;
494 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
495 struct net *net = dev_net_rcu(skb->dev);
496 const int type = icmp_hdr(skb)->type;
497 const int code = icmp_hdr(skb)->code;
498 struct request_sock *fastopen;
499 struct tcp_sock *tp;
500 u32 seq, snd_una;
501 struct sock *sk;
502 int err;
503
504 sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
505 ntohs(th->source), inet_iif(skb), 0);
506 if (!sk) {
507 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
508 return -ENOENT;
509 }
510 if (sk->sk_state == TCP_TIME_WAIT) {
511 /* To increase the counter of ignored icmps for TCP-AO */
512 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
513 inet_twsk_put(inet_twsk(sk));
514 return 0;
515 }
516 seq = ntohl(th->seq);
517 if (sk->sk_state == TCP_NEW_SYN_RECV) {
518 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
519 type == ICMP_TIME_EXCEEDED ||
520 (type == ICMP_DEST_UNREACH &&
521 (code == ICMP_NET_UNREACH ||
522 code == ICMP_HOST_UNREACH)));
523 return 0;
524 }
525
526 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
527 sock_put(sk);
528 return 0;
529 }
530
531 bh_lock_sock(sk);
532 /* If too many ICMPs get dropped on busy
533 * servers this needs to be solved differently.
534 * We do take care of PMTU discovery (RFC1191) special case :
535 * we can receive locally generated ICMP messages while socket is held.
536 */
537 if (sock_owned_by_user(sk)) {
538 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
539 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
540 }
541 if (sk->sk_state == TCP_CLOSE)
542 goto out;
543
544 if (static_branch_unlikely(&ip4_min_ttl)) {
545 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
546 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
547 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
548 goto out;
549 }
550 }
551
552 tp = tcp_sk(sk);
553 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
554 fastopen = rcu_dereference(tp->fastopen_rsk);
555 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
556 if (sk->sk_state != TCP_LISTEN &&
557 !between(seq, snd_una, tp->snd_nxt)) {
558 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
559 goto out;
560 }
561
562 switch (type) {
563 case ICMP_REDIRECT:
564 if (!sock_owned_by_user(sk))
565 do_redirect(skb, sk);
566 goto out;
567 case ICMP_SOURCE_QUENCH:
568 /* Just silently ignore these. */
569 goto out;
570 case ICMP_PARAMETERPROB:
571 err = EPROTO;
572 break;
573 case ICMP_DEST_UNREACH:
574 if (code > NR_ICMP_UNREACH)
575 goto out;
576
577 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
578 /* We are not interested in TCP_LISTEN and open_requests
579 * (SYN-ACKs send out by Linux are always <576bytes so
580 * they should go through unfragmented).
581 */
582 if (sk->sk_state == TCP_LISTEN)
583 goto out;
584
585 WRITE_ONCE(tp->mtu_info, info);
586 if (!sock_owned_by_user(sk)) {
587 tcp_v4_mtu_reduced(sk);
588 } else {
589 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
590 sock_hold(sk);
591 }
592 goto out;
593 }
594
595 err = icmp_err_convert[code].errno;
596 /* check if this ICMP message allows revert of backoff.
597 * (see RFC 6069)
598 */
599 if (!fastopen &&
600 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
601 tcp_ld_RTO_revert(sk, seq);
602 break;
603 case ICMP_TIME_EXCEEDED:
604 err = EHOSTUNREACH;
605 break;
606 default:
607 goto out;
608 }
609
610 switch (sk->sk_state) {
611 case TCP_SYN_SENT:
612 case TCP_SYN_RECV:
613 /* Only in fast or simultaneous open. If a fast open socket is
614 * already accepted it is treated as a connected one below.
615 */
616 if (fastopen && !fastopen->sk)
617 break;
618
619 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
620
621 if (!sock_owned_by_user(sk))
622 tcp_done_with_error(sk, err);
623 else
624 WRITE_ONCE(sk->sk_err_soft, err);
625 goto out;
626 }
627
628 /* If we've already connected we will keep trying
629 * until we time out, or the user gives up.
630 *
631 * rfc1122 4.2.3.9 allows to consider as hard errors
632 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
633 * but it is obsoleted by pmtu discovery).
634 *
635 * Note, that in modern internet, where routing is unreliable
636 * and in each dark corner broken firewalls sit, sending random
637 * errors ordered by their masters even this two messages finally lose
638 * their original sense (even Linux sends invalid PORT_UNREACHs)
639 *
640 * Now we are in compliance with RFCs.
641 * --ANK (980905)
642 */
643
644 if (!sock_owned_by_user(sk) &&
645 inet_test_bit(RECVERR, sk)) {
646 WRITE_ONCE(sk->sk_err, err);
647 sk_error_report(sk);
648 } else { /* Only an error on timeout */
649 WRITE_ONCE(sk->sk_err_soft, err);
650 }
651
652 out:
653 bh_unlock_sock(sk);
654 sock_put(sk);
655 return 0;
656 }
657
658 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
659
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])660 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
661 const struct tcp_ao_hdr *aoh,
662 struct ip_reply_arg *arg, struct tcphdr *reply,
663 __be32 reply_options[REPLY_OPTIONS_LEN])
664 {
665 #ifdef CONFIG_TCP_AO
666 int sdif = tcp_v4_sdif(skb);
667 int dif = inet_iif(skb);
668 int l3index = sdif ? dif : 0;
669 bool allocated_traffic_key;
670 struct tcp_ao_key *key;
671 char *traffic_key;
672 bool drop = true;
673 u32 ao_sne = 0;
674 u8 keyid;
675
676 rcu_read_lock();
677 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
678 &key, &traffic_key, &allocated_traffic_key,
679 &keyid, &ao_sne))
680 goto out;
681
682 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
683 (aoh->rnext_keyid << 8) | keyid);
684 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
685 reply->doff = arg->iov[0].iov_len / 4;
686
687 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
688 key, traffic_key,
689 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
690 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
691 reply, ao_sne))
692 goto out;
693 drop = false;
694 out:
695 rcu_read_unlock();
696 if (allocated_traffic_key)
697 kfree(traffic_key);
698 return drop;
699 #else
700 return true;
701 #endif
702 }
703
704 /*
705 * This routine will send an RST to the other tcp.
706 *
707 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
708 * for reset.
709 * Answer: if a packet caused RST, it is not for a socket
710 * existing in our system, if it is matched to a socket,
711 * it is just duplicate segment or bug in other side's TCP.
712 * So that we build reply only basing on parameters
713 * arrived with segment.
714 * Exception: precedence violation. We do not implement it in any case.
715 */
716
tcp_v4_send_reset(const struct sock * sk,struct sk_buff * skb,enum sk_rst_reason reason)717 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
718 enum sk_rst_reason reason)
719 {
720 const struct tcphdr *th = tcp_hdr(skb);
721 struct {
722 struct tcphdr th;
723 __be32 opt[REPLY_OPTIONS_LEN];
724 } rep;
725 const __u8 *md5_hash_location = NULL;
726 const struct tcp_ao_hdr *aoh;
727 struct ip_reply_arg arg;
728 #ifdef CONFIG_TCP_MD5SIG
729 struct tcp_md5sig_key *key = NULL;
730 unsigned char newhash[16];
731 struct sock *sk1 = NULL;
732 #endif
733 u64 transmit_time = 0;
734 struct sock *ctl_sk;
735 struct net *net;
736 u32 txhash = 0;
737
738 /* Never send a reset in response to a reset. */
739 if (th->rst)
740 return;
741
742 /* If sk not NULL, it means we did a successful lookup and incoming
743 * route had to be correct. prequeue might have dropped our dst.
744 */
745 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
746 return;
747
748 /* Swap the send and the receive. */
749 memset(&rep, 0, sizeof(rep));
750 rep.th.dest = th->source;
751 rep.th.source = th->dest;
752 rep.th.doff = sizeof(struct tcphdr) / 4;
753 rep.th.rst = 1;
754
755 if (th->ack) {
756 rep.th.seq = th->ack_seq;
757 } else {
758 rep.th.ack = 1;
759 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
760 skb->len - (th->doff << 2));
761 }
762
763 memset(&arg, 0, sizeof(arg));
764 arg.iov[0].iov_base = (unsigned char *)&rep;
765 arg.iov[0].iov_len = sizeof(rep.th);
766
767 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
768
769 /* Invalid TCP option size or twice included auth */
770 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
771 return;
772
773 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
774 return;
775
776 #ifdef CONFIG_TCP_MD5SIG
777 rcu_read_lock();
778 if (sk && sk_fullsock(sk)) {
779 const union tcp_md5_addr *addr;
780 int l3index;
781
782 /* sdif set, means packet ingressed via a device
783 * in an L3 domain and inet_iif is set to it.
784 */
785 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
786 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
787 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
788 } else if (md5_hash_location) {
789 const union tcp_md5_addr *addr;
790 int sdif = tcp_v4_sdif(skb);
791 int dif = inet_iif(skb);
792 int l3index;
793
794 /*
795 * active side is lost. Try to find listening socket through
796 * source port, and then find md5 key through listening socket.
797 * we are not loose security here:
798 * Incoming packet is checked with md5 hash with finding key,
799 * no RST generated if md5 hash doesn't match.
800 */
801 sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
802 th->source, ip_hdr(skb)->daddr,
803 ntohs(th->source), dif, sdif);
804 /* don't send rst if it can't find key */
805 if (!sk1)
806 goto out;
807
808 /* sdif set, means packet ingressed via a device
809 * in an L3 domain and dif is set to it.
810 */
811 l3index = sdif ? dif : 0;
812 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
813 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
814 if (!key)
815 goto out;
816
817 tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
818 if (crypto_memneq(md5_hash_location, newhash, 16))
819 goto out;
820 }
821
822 if (key) {
823 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
824 (TCPOPT_NOP << 16) |
825 (TCPOPT_MD5SIG << 8) |
826 TCPOLEN_MD5SIG);
827 /* Update length and the length the header thinks exists */
828 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
829 rep.th.doff = arg.iov[0].iov_len / 4;
830
831 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
832 key, ip_hdr(skb)->saddr,
833 ip_hdr(skb)->daddr, &rep.th);
834 }
835 #endif
836 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
837 if (rep.opt[0] == 0) {
838 __be32 mrst = mptcp_reset_option(skb);
839
840 if (mrst) {
841 rep.opt[0] = mrst;
842 arg.iov[0].iov_len += sizeof(mrst);
843 rep.th.doff = arg.iov[0].iov_len / 4;
844 }
845 }
846
847 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
848 ip_hdr(skb)->saddr, /* XXX */
849 arg.iov[0].iov_len, IPPROTO_TCP, 0);
850 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
851 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
852
853 /* When socket is gone, all binding information is lost.
854 * routing might fail in this case. No choice here, if we choose to force
855 * input interface, we will misroute in case of asymmetric route.
856 */
857 if (sk)
858 arg.bound_dev_if = sk->sk_bound_dev_if;
859
860 trace_tcp_send_reset(sk, skb, reason);
861
862 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
863 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
864
865 /* ECN bits of TW reset are cleared */
866 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
867 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
868 local_bh_disable();
869 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
870 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
871
872 sock_net_set(ctl_sk, net);
873 if (sk) {
874 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
875 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
876 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
877 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
878 transmit_time = tcp_transmit_time(sk);
879 xfrm_sk_clone_policy(ctl_sk, sk);
880 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
881 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
882 } else {
883 ctl_sk->sk_mark = 0;
884 ctl_sk->sk_priority = 0;
885 }
886 ip_send_unicast_reply(ctl_sk, sk,
887 skb, &TCP_SKB_CB(skb)->header.h4.opt,
888 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
889 &arg, arg.iov[0].iov_len,
890 transmit_time, txhash);
891
892 xfrm_sk_free_policy(ctl_sk);
893 sock_net_set(ctl_sk, &init_net);
894 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
895 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
896 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
897 local_bh_enable();
898
899 #ifdef CONFIG_TCP_MD5SIG
900 out:
901 rcu_read_unlock();
902 #endif
903 }
904
905 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
906 outside socket context is ugly, certainly. What can I do?
907 */
908
tcp_v4_send_ack(const struct sock * sk,struct sk_buff * skb,u32 seq,u32 ack,u32 win,u32 tsval,u32 tsecr,int oif,struct tcp_key * key,int reply_flags,u8 tos,u32 txhash)909 static void tcp_v4_send_ack(const struct sock *sk,
910 struct sk_buff *skb, u32 seq, u32 ack,
911 u32 win, u32 tsval, u32 tsecr, int oif,
912 struct tcp_key *key,
913 int reply_flags, u8 tos, u32 txhash)
914 {
915 const struct tcphdr *th = tcp_hdr(skb);
916 struct {
917 struct tcphdr th;
918 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
919 } rep;
920 struct net *net = sock_net(sk);
921 struct ip_reply_arg arg;
922 struct sock *ctl_sk;
923 u64 transmit_time;
924
925 memset(&rep.th, 0, sizeof(struct tcphdr));
926 memset(&arg, 0, sizeof(arg));
927
928 arg.iov[0].iov_base = (unsigned char *)&rep;
929 arg.iov[0].iov_len = sizeof(rep.th);
930 if (tsecr) {
931 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
932 (TCPOPT_TIMESTAMP << 8) |
933 TCPOLEN_TIMESTAMP);
934 rep.opt[1] = htonl(tsval);
935 rep.opt[2] = htonl(tsecr);
936 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
937 }
938
939 /* Swap the send and the receive. */
940 rep.th.dest = th->source;
941 rep.th.source = th->dest;
942 rep.th.doff = arg.iov[0].iov_len / 4;
943 rep.th.seq = htonl(seq);
944 rep.th.ack_seq = htonl(ack);
945 rep.th.ack = 1;
946 rep.th.window = htons(win);
947
948 #ifdef CONFIG_TCP_MD5SIG
949 if (tcp_key_is_md5(key)) {
950 int offset = (tsecr) ? 3 : 0;
951
952 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
953 (TCPOPT_NOP << 16) |
954 (TCPOPT_MD5SIG << 8) |
955 TCPOLEN_MD5SIG);
956 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
957 rep.th.doff = arg.iov[0].iov_len/4;
958
959 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
960 key->md5_key, ip_hdr(skb)->saddr,
961 ip_hdr(skb)->daddr, &rep.th);
962 }
963 #endif
964 #ifdef CONFIG_TCP_AO
965 if (tcp_key_is_ao(key)) {
966 int offset = (tsecr) ? 3 : 0;
967
968 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
969 (tcp_ao_len(key->ao_key) << 16) |
970 (key->ao_key->sndid << 8) |
971 key->rcv_next);
972 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
973 rep.th.doff = arg.iov[0].iov_len / 4;
974
975 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
976 key->ao_key, key->traffic_key,
977 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
978 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
979 &rep.th, key->sne);
980 }
981 #endif
982 arg.flags = reply_flags;
983 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
984 ip_hdr(skb)->saddr, /* XXX */
985 arg.iov[0].iov_len, IPPROTO_TCP, 0);
986 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
987 if (oif)
988 arg.bound_dev_if = oif;
989 arg.tos = tos;
990 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
991 local_bh_disable();
992 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
993 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
994 sock_net_set(ctl_sk, net);
995 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
996 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
997 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
998 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
999 transmit_time = tcp_transmit_time(sk);
1000 ip_send_unicast_reply(ctl_sk, sk,
1001 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1002 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1003 &arg, arg.iov[0].iov_len,
1004 transmit_time, txhash);
1005
1006 sock_net_set(ctl_sk, &init_net);
1007 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1008 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1009 local_bh_enable();
1010 }
1011
/* Answer a segment received for a TIME_WAIT socket with an ACK.
 *
 * @sk:        the timewait socket (viewed through inet_twsk()/tcp_twsk())
 * @skb:       the received segment being acknowledged
 * @tw_status: how the segment was classified; only TCP_TW_ACK_OOW changes
 *             behaviour here (the ECN bits of the reply TOS are cleared)
 *
 * A signing key is selected (TCP-AO if one is found, otherwise TCP-MD5 if
 * enabled and present, otherwise none), the ACK is emitted through
 * tcp_v4_send_ack(), and inet_twsk_put(tw) is called on every return path.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Option parsing failed: send nothing, just release tw */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			/* Pick the key the peer asked for via rnext_keyid */
			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* key.ao_key can only be non-NULL if ao_info was loaded above */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Without TCP-AO, open a dead branch so that the "} else if"
	 * below closes a block in both configurations.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1075
1076 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1077 struct request_sock *req)
1078 {
1079 struct tcp_key key = {};
1080
1081 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1082 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1083 */
1084 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1085 tcp_sk(sk)->snd_nxt;
1086
1087 #ifdef CONFIG_TCP_AO
1088 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1089 tcp_rsk_used_ao(req)) {
1090 const union tcp_md5_addr *addr;
1091 const struct tcp_ao_hdr *aoh;
1092 int l3index;
1093
1094 /* Invalid TCP option size or twice included auth */
1095 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1096 return;
1097 if (!aoh)
1098 return;
1099
1100 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1101 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1102 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1103 aoh->rnext_keyid, -1);
1104 if (unlikely(!key.ao_key)) {
1105 /* Send ACK with any matching MKT for the peer */
1106 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1107 /* Matching key disappeared (user removed the key?)
1108 * let the handshake timeout.
1109 */
1110 if (!key.ao_key) {
1111 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1112 addr,
1113 ntohs(tcp_hdr(skb)->source),
1114 &ip_hdr(skb)->daddr,
1115 ntohs(tcp_hdr(skb)->dest));
1116 return;
1117 }
1118 }
1119 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1120 if (!key.traffic_key)
1121 return;
1122
1123 key.type = TCP_KEY_AO;
1124 key.rcv_next = aoh->keyid;
1125 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1126 #else
1127 if (0) {
1128 #endif
1129 } else if (static_branch_tcp_md5()) {
1130 const union tcp_md5_addr *addr;
1131 int l3index;
1132
1133 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1134 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1135 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1136 if (key.md5_key)
1137 key.type = TCP_KEY_MD5;
1138 }
1139
1140 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
1141 tcp_v4_send_ack(sk, skb, seq,
1142 tcp_rsk(req)->rcv_nxt,
1143 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1144 tcp_rsk_tsval(tcp_rsk(req)),
1145 req->ts_recent,
1146 0, &key,
1147 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1148 ip_hdr(skb)->tos & ~INET_ECN_MASK,
1149 READ_ONCE(tcp_rsk(req)->txhash));
1150 if (tcp_key_is_ao(&key))
1151 kfree(key.traffic_key);
1152 }
1153
1154 /*
1155 * Send a SYN-ACK after having received a SYN.
1156 * This still operates on a request_sock only, not on a big
1157 * socket.
1158 */
1159 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1160 struct flowi *fl,
1161 struct request_sock *req,
1162 struct tcp_fastopen_cookie *foc,
1163 enum tcp_synack_type synack_type,
1164 struct sk_buff *syn_skb)
1165 {
1166 struct inet_request_sock *ireq = inet_rsk(req);
1167 struct flowi4 fl4;
1168 int err = -1;
1169 struct sk_buff *skb;
1170 u8 tos;
1171
1172 /* First, grab a route. */
1173 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1174 return -1;
1175
1176 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1177
1178 if (skb) {
1179 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1180 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1181
1182 tos = READ_ONCE(inet_sk(sk)->tos);
1183
1184 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1185 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1186 (tos & INET_ECN_MASK);
1187
1188 if (!INET_ECN_is_capable(tos) &&
1189 tcp_bpf_ca_needs_ecn((struct sock *)req))
1190 tos |= INET_ECN_ECT_0;
1191
1192 rcu_read_lock();
1193 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1194 ireq->ir_rmt_addr,
1195 rcu_dereference(ireq->ireq_opt),
1196 tos);
1197 rcu_read_unlock();
1198 err = net_xmit_eval(err);
1199 }
1200
1201 return err;
1202 }
1203
1204 /*
1205 * IPv4 request_sock destructor.
1206 */
1207 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1208 {
1209 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1210 }
1211
1212 #ifdef CONFIG_TCP_MD5SIG
1213 /*
1214 * RFC2385 MD5 checksumming requires a mapping of
1215 * IP address->MD5 Key.
1216 * We need to maintain these in the sk structure.
1217 */
1218
/* Deferred static key (defined false, HZ passed as the timeout argument).
 * tcp_md5_do_add() and tcp_md5_key_copy() increment it when a socket gets
 * its md5sig_info allocated.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1220
1221 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1222 {
1223 if (!old)
1224 return true;
1225
1226 /* l3index always overrides non-l3index */
1227 if (old->l3index && new->l3index == 0)
1228 return false;
1229 if (old->l3index == 0 && new->l3index)
1230 return true;
1231
1232 return old->prefixlen < new->prefixlen;
1233 }
1234
1235 /* Find the Key structure for an address. */
1236 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1237 const union tcp_md5_addr *addr,
1238 int family, bool any_l3index)
1239 {
1240 const struct tcp_sock *tp = tcp_sk(sk);
1241 struct tcp_md5sig_key *key;
1242 const struct tcp_md5sig_info *md5sig;
1243 __be32 mask;
1244 struct tcp_md5sig_key *best_match = NULL;
1245 bool match;
1246
1247 /* caller either holds rcu_read_lock() or socket lock */
1248 md5sig = rcu_dereference_check(tp->md5sig_info,
1249 lockdep_sock_is_held(sk));
1250 if (!md5sig)
1251 return NULL;
1252
1253 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1254 lockdep_sock_is_held(sk)) {
1255 if (key->family != family)
1256 continue;
1257 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1258 key->l3index != l3index)
1259 continue;
1260 if (family == AF_INET) {
1261 mask = inet_make_mask(key->prefixlen);
1262 match = (key->addr.a4.s_addr & mask) ==
1263 (addr->a4.s_addr & mask);
1264 #if IS_ENABLED(CONFIG_IPV6)
1265 } else if (family == AF_INET6) {
1266 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1267 key->prefixlen);
1268 #endif
1269 } else {
1270 match = false;
1271 }
1272
1273 if (match && better_md5_match(best_match, key))
1274 best_match = key;
1275 }
1276 return best_match;
1277 }
1278
1279 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1280 const union tcp_md5_addr *addr,
1281 int family, u8 prefixlen,
1282 int l3index, u8 flags)
1283 {
1284 const struct tcp_sock *tp = tcp_sk(sk);
1285 struct tcp_md5sig_key *key;
1286 unsigned int size = sizeof(struct in_addr);
1287 const struct tcp_md5sig_info *md5sig;
1288
1289 /* caller either holds rcu_read_lock() or socket lock */
1290 md5sig = rcu_dereference_check(tp->md5sig_info,
1291 lockdep_sock_is_held(sk));
1292 if (!md5sig)
1293 return NULL;
1294 #if IS_ENABLED(CONFIG_IPV6)
1295 if (family == AF_INET6)
1296 size = sizeof(struct in6_addr);
1297 #endif
1298 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1299 lockdep_sock_is_held(sk)) {
1300 if (key->family != family)
1301 continue;
1302 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1303 continue;
1304 if (key->l3index != l3index)
1305 continue;
1306 if (!memcmp(&key->addr, addr, size) &&
1307 key->prefixlen == prefixlen)
1308 return key;
1309 }
1310 return NULL;
1311 }
1312
1313 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1314 const struct sock *addr_sk)
1315 {
1316 const union tcp_md5_addr *addr;
1317 int l3index;
1318
1319 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1320 addr_sk->sk_bound_dev_if);
1321 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1322 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1323 }
1324
1325 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1326 {
1327 struct tcp_sock *tp = tcp_sk(sk);
1328 struct tcp_md5sig_info *md5sig;
1329
1330 md5sig = kmalloc_obj(*md5sig, gfp);
1331 if (!md5sig)
1332 return -ENOMEM;
1333
1334 sk_gso_disable(sk);
1335 INIT_HLIST_HEAD(&md5sig->head);
1336 rcu_assign_pointer(tp->md5sig_info, md5sig);
1337 return 0;
1338 }
1339
1340 /* This can be called on a newly created socket, from other files */
1341 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1342 int family, u8 prefixlen, int l3index, u8 flags,
1343 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1344 {
1345 /* Add Key to the list */
1346 struct tcp_md5sig_key *key;
1347 struct tcp_sock *tp = tcp_sk(sk);
1348 struct tcp_md5sig_info *md5sig;
1349
1350 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1351 if (key) {
1352 /* Pre-existing entry - just update that one.
1353 * Note that the key might be used concurrently.
1354 * data_race() is telling kcsan that we do not care of
1355 * key mismatches, since changing MD5 key on live flows
1356 * can lead to packet drops.
1357 */
1358 data_race(memcpy(key->key, newkey, newkeylen));
1359
1360 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1361 * Also note that a reader could catch new key->keylen value
1362 * but old key->key[], this is the reason we use __GFP_ZERO
1363 * at sock_kmalloc() time below these lines.
1364 */
1365 WRITE_ONCE(key->keylen, newkeylen);
1366
1367 return 0;
1368 }
1369
1370 md5sig = rcu_dereference_protected(tp->md5sig_info,
1371 lockdep_sock_is_held(sk));
1372
1373 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1374 if (!key)
1375 return -ENOMEM;
1376
1377 memcpy(key->key, newkey, newkeylen);
1378 key->keylen = newkeylen;
1379 key->family = family;
1380 key->prefixlen = prefixlen;
1381 key->l3index = l3index;
1382 key->flags = flags;
1383 memcpy(&key->addr, addr,
1384 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1385 sizeof(struct in_addr));
1386 hlist_add_head_rcu(&key->node, &md5sig->head);
1387 return 0;
1388 }
1389
1390 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1391 int family, u8 prefixlen, int l3index, u8 flags,
1392 const u8 *newkey, u8 newkeylen)
1393 {
1394 struct tcp_sock *tp = tcp_sk(sk);
1395
1396 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1397 if (fips_enabled) {
1398 pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
1399 return -EOPNOTSUPP;
1400 }
1401
1402 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1403 return -ENOMEM;
1404
1405 if (!static_branch_inc(&tcp_md5_needed.key)) {
1406 struct tcp_md5sig_info *md5sig;
1407
1408 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1409 rcu_assign_pointer(tp->md5sig_info, NULL);
1410 kfree_rcu(md5sig, rcu);
1411 return -EUSERS;
1412 }
1413 }
1414
1415 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1416 newkey, newkeylen, GFP_KERNEL);
1417 }
1418
1419 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1420 int family, u8 prefixlen, int l3index,
1421 struct tcp_md5sig_key *key)
1422 {
1423 struct tcp_sock *tp = tcp_sk(sk);
1424
1425 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1426
1427 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1428 return -ENOMEM;
1429
1430 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1431 struct tcp_md5sig_info *md5sig;
1432
1433 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1434 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1435 rcu_assign_pointer(tp->md5sig_info, NULL);
1436 kfree_rcu(md5sig, rcu);
1437 return -EUSERS;
1438 }
1439 }
1440
1441 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1442 key->flags, key->key, key->keylen,
1443 sk_gfp_mask(sk, GFP_ATOMIC));
1444 }
1445
1446 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1447 u8 prefixlen, int l3index, u8 flags)
1448 {
1449 struct tcp_md5sig_key *key;
1450
1451 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1452 if (!key)
1453 return -ENOENT;
1454 hlist_del_rcu(&key->node);
1455 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1456 kfree_rcu(key, rcu);
1457 return 0;
1458 }
1459
1460 void tcp_clear_md5_list(struct sock *sk)
1461 {
1462 struct tcp_sock *tp = tcp_sk(sk);
1463 struct tcp_md5sig_key *key;
1464 struct hlist_node *n;
1465 struct tcp_md5sig_info *md5sig;
1466
1467 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1468
1469 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1470 hlist_del(&key->node);
1471 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1472 kfree(key);
1473 }
1474 }
1475
1476 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1477 sockptr_t optval, int optlen)
1478 {
1479 struct tcp_md5sig cmd;
1480 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1481 const union tcp_md5_addr *addr;
1482 u8 prefixlen = 32;
1483 int l3index = 0;
1484 bool l3flag;
1485 u8 flags;
1486
1487 if (optlen < sizeof(cmd))
1488 return -EINVAL;
1489
1490 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1491 return -EFAULT;
1492
1493 if (sin->sin_family != AF_INET)
1494 return -EINVAL;
1495
1496 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1497 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1498
1499 if (optname == TCP_MD5SIG_EXT &&
1500 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1501 prefixlen = cmd.tcpm_prefixlen;
1502 if (prefixlen > 32)
1503 return -EINVAL;
1504 }
1505
1506 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1507 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1508 struct net_device *dev;
1509
1510 rcu_read_lock();
1511 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1512 if (dev && netif_is_l3_master(dev))
1513 l3index = dev->ifindex;
1514
1515 rcu_read_unlock();
1516
1517 /* ok to reference set/not set outside of rcu;
1518 * right now device MUST be an L3 master
1519 */
1520 if (!dev || !l3index)
1521 return -EINVAL;
1522 }
1523
1524 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1525
1526 if (!cmd.tcpm_keylen)
1527 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1528
1529 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1530 return -EINVAL;
1531
1532 /* Don't allow keys for peers that have a matching TCP-AO key.
1533 * See the comment in tcp_ao_add_cmd()
1534 */
1535 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1536 return -EKEYREJECTED;
1537
1538 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1539 cmd.tcpm_key, cmd.tcpm_keylen);
1540 }
1541
1542 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1543 __be32 daddr, __be32 saddr,
1544 const struct tcphdr *th, int nbytes)
1545 {
1546 struct {
1547 struct tcp4_pseudohdr ip;
1548 struct tcphdr tcp;
1549 } h;
1550
1551 h.ip.saddr = saddr;
1552 h.ip.daddr = daddr;
1553 h.ip.pad = 0;
1554 h.ip.protocol = IPPROTO_TCP;
1555 h.ip.len = cpu_to_be16(nbytes);
1556 h.tcp = *th;
1557 h.tcp.check = 0;
1558 md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1559 }
1560
1561 static noinline_for_stack void
1562 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1563 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1564 {
1565 struct md5_ctx ctx;
1566
1567 md5_init(&ctx);
1568 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1569 tcp_md5_hash_key(&ctx, key);
1570 md5_final(&ctx, md5_hash);
1571 }
1572
1573 noinline_for_stack void
1574 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1575 const struct sock *sk, const struct sk_buff *skb)
1576 {
1577 const struct tcphdr *th = tcp_hdr(skb);
1578 __be32 saddr, daddr;
1579 struct md5_ctx ctx;
1580
1581 if (sk) { /* valid for establish/request sockets */
1582 saddr = sk->sk_rcv_saddr;
1583 daddr = sk->sk_daddr;
1584 } else {
1585 const struct iphdr *iph = ip_hdr(skb);
1586 saddr = iph->saddr;
1587 daddr = iph->daddr;
1588 }
1589
1590 md5_init(&ctx);
1591 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1592 tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1593 tcp_md5_hash_key(&ctx, key);
1594 md5_final(&ctx, md5_hash);
1595 }
1596
1597 #endif
1598
1599 static void tcp_v4_init_req(struct request_sock *req,
1600 const struct sock *sk_listener,
1601 struct sk_buff *skb)
1602 {
1603 struct inet_request_sock *ireq = inet_rsk(req);
1604 struct net *net = sock_net(sk_listener);
1605
1606 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1607 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1608 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1609 }
1610
1611 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1612 struct sk_buff *skb,
1613 struct flowi *fl,
1614 struct request_sock *req,
1615 u32 tw_isn)
1616 {
1617 tcp_v4_init_req(req, sk, skb);
1618
1619 if (security_inet_conn_request(sk, skb, req))
1620 return NULL;
1621
1622 return inet_csk_route_req(sk, &fl->u.ip4, req);
1623 }
1624
/* Generic request sock operations for TCP over IPv4: the object size to
 * allocate and the callbacks for ACKing, resetting and destroying a
 * request.
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};
1632
/* TCP-specific, address-family dependent request sock operations for IPv4.
 * The MD5, TCP-AO and syncookie hooks are only populated when the
 * corresponding config option is enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq_and_ts_off	=	tcp_v4_init_seq_and_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
1651
1652 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1653 {
1654 /* Never answer to SYNs send to broadcast or multicast */
1655 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1656 goto drop;
1657
1658 return tcp_conn_request(&tcp_request_sock_ops,
1659 &tcp_request_sock_ipv4_ops, sk, skb);
1660
1661 drop:
1662 tcp_listendrop(sk);
1663 return 0;
1664 }
1665
1666
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk:             parent socket; its accept queue is checked and listen
 *                  drops are accounted against it
 * @skb:            segment that completed the handshake
 * @req:            request sock describing the connection
 * @dst:            route to use, or NULL to look one up with
 *                  inet_csk_route_child_sock()
 * @req_unhash:     request sock handed to inet_ehash_nolisten(); NULL in
 *                  the syncookie case (per the comment at the duplicate
 *                  check below)
 * @own_req:        set to the result of inet_ehash_nolisten()
 * @opt_child_init: optional callback run on the child; only invoked when
 *                  CONFIG_IPV6 is enabled
 *
 * Returns the child socket, or NULL if the accept queue is full, the child
 * cannot be created or set up, or (syncookie case) a duplicate socket was
 * found in the hash.  Note that a non-NULL child may be returned with
 * *own_req false.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	/* The child shares the request's IP options pointer for now;
	 * exactly one of the two references is cleared further down.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* The key is only copied when the request did not use TCP-AO */
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child keeps the options; drop the request's pointer */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the options; drop the child's pointer */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* The options still belong to the request; detach them from the
	 * child before it is torn down.
	 */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
1790
/* With CONFIG_SYN_COOKIES, pass non-SYN segments through
 * cookie_v4_check() and return its result; in every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1801
1802 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1803 struct tcphdr *th, u32 *cookie)
1804 {
1805 u16 mss = 0;
1806 #ifdef CONFIG_SYN_COOKIES
1807 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1808 &tcp_request_sock_ipv4_ops, sk, th);
1809 if (mss) {
1810 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1811 tcp_synq_overflow(sk);
1812 }
1813 #endif
1814 return mss;
1815 }
1816
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Processes one received segment for @sk.  Always returns 0; the skb is
 * either handed on to tcp_rcv_established(), tcp_child_process() or
 * tcp_rcv_state_process(), or dropped here with a drop reason.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		/* Drop the cached rx dst if it differs from the skb's dst and
		 * either the ingress interface changed or the dst's check
		 * callback returns NULL.
		 */
		if (dst && unlikely(dst != skb_dst(skb))) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			/* A child socket came back from the cookie check */
			reason = tcp_child_process(sk, nsk, skb);
			/* NOTE(review): this put assumes tcp_child_process()
			 * does not itself drop the reference on nsk - confirm
			 * against its definition (not visible here).
			 */
			sock_put(nsk);
			if (reason)
				goto reset;
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason)
		goto reset;
	return 0;

reset:
	tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	/* Checksum failures fall through and are also counted as INERRS */
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1898
/* Queue @skb on the backlog of @sk, first trying to merge it into the
 * last backlog skb.
 *
 * Returns SKB_NOT_DROPPED_YET when the skb was coalesced or queued.
 * Otherwise a drop reason is returned (bad checksum, pfmemalloc refusal
 * or backlog limit hit); on those paths bh_unlock_sock(sk) has already
 * been called here.
 */
enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return SKB_DROP_REASON_TCP_CSUM;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalescing requires: contiguous sequence space, same DS field,
	 * no SYN/RST/URG on either skb, ACK on both, identical
	 * ECE/CWR/AE flags, tcp_skb_can_collapse_rx() agreeing, equal
	 * header length with byte-identical TCP options, and no PSP
	 * metadata difference.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	/* Strip the TCP header so only payload gets appended to tail;
	 * it is pushed back below if the merge does not happen.
	 */
	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	/* From here on shinfo refers to the tail skb */
	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Only take ack_seq/window from skb if its ACK is not older */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 *       at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return SKB_NOT_DROPPED_YET;
	}
	/* Merge refused: restore the TCP header before queueing normally */
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	/* The sum is computed in 64 bits, then clamped to UINT_MAX */
	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
			return SKB_DROP_REASON_PFMEMALLOC;
		}
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return SKB_DROP_REASON_SOCKET_BACKLOG;
	}
	return SKB_NOT_DROPPED_YET;
}
2036
2037 static void tcp_v4_restore_cb(struct sk_buff *skb)
2038 {
2039 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2040 sizeof(struct inet_skb_parm));
2041 }
2042
2043 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2044 const struct tcphdr *th)
2045 {
2046 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2047 * barrier() makes sure compiler wont play fool^Waliasing games.
2048 */
2049 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2050 sizeof(struct inet_skb_parm));
2051 barrier();
2052
2053 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2054 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2055 skb->len - th->doff * 4);
2056 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2057 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2058 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2059 TCP_SKB_CB(skb)->sacked = 0;
2060 TCP_SKB_CB(skb)->has_rxtstamp =
2061 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2062 }
2063
2064 /*
2065 * From tcp_input.c
2066 */
2067
/* IPv4 receive entry point for one TCP segment.
 *
 * Validates the TCP header length and checksum, looks up the socket for the
 * segment and dispatches on the socket state:
 *  - TCP_NEW_SYN_RECV (request socket): checked against the listener via
 *    tcp_check_req(); a resulting child is fed through tcp_child_process().
 *  - TCP_TIME_WAIT: handled by tcp_timewait_state_process().
 *  - anything else: passed to tcp_v4_do_rcv(), or queued with
 *    tcp_add_backlog() when the socket is currently owned by user context.
 *
 * Returns the value of tcp_v4_do_rcv() when the segment was processed
 * synchronously on the "process" path, and 0 on every other path
 * (backlog queueing, child processing, and all drop paths).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff is in 32-bit words; it must cover at least the fixed header. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers after the pulls above. */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	/* Also re-entered below when a request socket could not be migrated
	 * or was stolen by another CPU.
	 */
	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* From here on sk is the listener, req the looked-up request. */
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_skbadd(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		drop_reason = tcp_filter(sk, skb);
		if (!drop_reason) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
					    &drop_reason);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* tcp_check_req() handed back the listener itself:
			 * continue on the regular "process" path with the
			 * IP control block restored.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			drop_reason = tcp_child_process(sk, nsk, skb);
			/* NOTE(review): both paths below drop a reference on
			 * nsk here; this presumes tcp_child_process() does not
			 * drop it itself - confirm against its definition.
			 */
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				sock_put(nsk);
				goto discard_and_relse;
			}
			sock_put(nsk);
			sock_put(sk);
			return 0;
		}
	}

process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	drop_reason = tcp_filter(sk, skb);
	if (drop_reason)
		goto discard_and_relse;

	/* Header pointers are reloaded after tcp_filter(). */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	/* Listeners are processed without taking the socket spinlock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* No bh_unlock_sock() on the failure path here: the visible
		 * tail of tcp_add_backlog() unlocks before returning a drop
		 * reason.
		 */
		drop_reason = tcp_add_backlog(sk, skb);
		if (drop_reason)
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* csum_error and bad_packet are jump targets inside this branch so
	 * that all three cases share the TCP_MIB_INERRS accounting; a segment
	 * with a good checksum and no socket is answered with a reset instead.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_skbadd(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}

	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
					       &drop_reason);
	switch (tw_status) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			/* A listener exists: retire the timewait socket and
			 * process the SYN against the listener. refcounted is
			 * cleared, so no sock_put() is done on sk2 later.
			 */
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}

		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
		if (drop_reason)
			break;
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
	case TCP_TW_ACK_OOW:
		tcp_v4_timewait_ack(sk, skb, tw_status);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2338
/* Timewait socket operations for TCP; only the per-object size is set. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
};
2342
2343 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2344 {
2345 struct dst_entry *dst = skb_dst(skb);
2346
2347 if (dst && dst_hold_safe(dst)) {
2348 rcu_assign_pointer(sk->sk_rx_dst, dst);
2349 sk->sk_rx_dst_ifindex = skb->skb_iif;
2350 }
2351 }
2352
/* Address-family specific connection-socket operations for TCP over IPv4.
 * Installed as icsk->icsk_af_ops by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};
2364
2365 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 callbacks for TCP segment authentication. Each group of members is
 * only present when the matching option (MD5 signatures, TCP-AO) is
 * configured. Installed as tp->af_specific by tcp_v4_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_parse		= tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup		= tcp_v4_ao_lookup,
	.calc_ao_hash		= tcp_v4_ao_hash_skb,
	.ao_parse		= tcp_v4_parse_ao,
	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
#endif
};
2379
/* sk_destruct callback installed by tcp_v4_init_sock() when MD5 or TCP-AO
 * support is built in: releases the MD5 and TCP-AO state of @sk, then runs
 * the generic inet_sock_destruct().
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
2386 #endif
2387
2388 /* NOTE: A lot of things set to zero explicitly by call to
2389 * sk_alloc() so need not be done here.
2390 */
2391 static int tcp_v4_init_sock(struct sock *sk)
2392 {
2393 struct inet_connection_sock *icsk = inet_csk(sk);
2394
2395 tcp_init_sock(sk);
2396
2397 icsk->icsk_af_ops = &ipv4_specific;
2398
2399 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2400 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2401 sk->sk_destruct = tcp4_destruct_sock;
2402 #endif
2403
2404 return 0;
2405 }
2406
/* Hand every entry still stored in sk->sk_user_frags to napi_pp_put_page(),
 * warning once if a put reports failure. Compiles to an empty function
 * without CONFIG_PAGE_POOL.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long idx;
	void *entry;

	xa_for_each(&sk->sk_user_frags, idx, entry) {
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)entry));
	}
#endif
}
2417
/* Release the TCP-level resources attached to @sk: user frags, timers,
 * congestion control and ULP state, queued data, the bind bucket, Fast Open
 * state and the saved SYN. Finishes by decrementing the allocated-sockets
 * accounting.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Put the entries first, then free the xarray that indexed them. */
	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A Fast Open request socket must not be attached any more. */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
2456
2457 #ifdef CONFIG_PROC_FS
2458 /* Proc filesystem TCP sock list dumping. */
2459
2460 static unsigned short seq_file_family(const struct seq_file *seq);
2461
2462 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2463 {
2464 unsigned short family = seq_file_family(seq);
2465
2466 /* AF_UNSPEC is used as a match all */
2467 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2468 net_eq(sock_net(sk), seq_file_net(seq)));
2469 }
2470
2471 /* Find a non empty bucket (starting from st->bucket)
2472 * and return the first sk from it.
2473 */
2474 static void *listening_get_first(struct seq_file *seq)
2475 {
2476 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2477 struct tcp_iter_state *st = seq->private;
2478
2479 st->offset = 0;
2480 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2481 struct inet_listen_hashbucket *ilb2;
2482 struct hlist_nulls_node *node;
2483 struct sock *sk;
2484
2485 ilb2 = &hinfo->lhash2[st->bucket];
2486 if (hlist_nulls_empty(&ilb2->nulls_head))
2487 continue;
2488
2489 spin_lock(&ilb2->lock);
2490 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2491 if (seq_sk_match(seq, sk))
2492 return sk;
2493 }
2494 spin_unlock(&ilb2->lock);
2495 }
2496
2497 return NULL;
2498 }
2499
2500 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2501 * If "cur" is the last one in the st->bucket,
2502 * call listening_get_first() to return the first sk of the next
2503 * non empty bucket.
2504 */
2505 static void *listening_get_next(struct seq_file *seq, void *cur)
2506 {
2507 struct tcp_iter_state *st = seq->private;
2508 struct inet_listen_hashbucket *ilb2;
2509 struct hlist_nulls_node *node;
2510 struct inet_hashinfo *hinfo;
2511 struct sock *sk = cur;
2512
2513 ++st->num;
2514 ++st->offset;
2515
2516 sk = sk_nulls_next(sk);
2517 sk_nulls_for_each_from(sk, node) {
2518 if (seq_sk_match(seq, sk))
2519 return sk;
2520 }
2521
2522 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2523 ilb2 = &hinfo->lhash2[st->bucket];
2524 spin_unlock(&ilb2->lock);
2525 ++st->bucket;
2526 return listening_get_first(seq);
2527 }
2528
2529 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2530 {
2531 struct tcp_iter_state *st = seq->private;
2532 void *rc;
2533
2534 st->bucket = 0;
2535 st->offset = 0;
2536 rc = listening_get_first(seq);
2537
2538 while (rc && *pos) {
2539 rc = listening_get_next(seq, rc);
2540 --*pos;
2541 }
2542 return rc;
2543 }
2544
2545 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2546 const struct tcp_iter_state *st)
2547 {
2548 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2549 }
2550
2551 /*
2552 * Get first established socket starting from bucket given in st->bucket.
2553 * If st->bucket is zero, the very first socket in the hash is returned.
2554 */
2555 static void *established_get_first(struct seq_file *seq)
2556 {
2557 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2558 struct tcp_iter_state *st = seq->private;
2559
2560 st->offset = 0;
2561 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2562 struct sock *sk;
2563 struct hlist_nulls_node *node;
2564 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2565
2566 cond_resched();
2567
2568 /* Lockless fast path for the common case of empty buckets */
2569 if (empty_bucket(hinfo, st))
2570 continue;
2571
2572 spin_lock_bh(lock);
2573 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2574 if (seq_sk_match(seq, sk))
2575 return sk;
2576 }
2577 spin_unlock_bh(lock);
2578 }
2579
2580 return NULL;
2581 }
2582
2583 static void *established_get_next(struct seq_file *seq, void *cur)
2584 {
2585 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2586 struct tcp_iter_state *st = seq->private;
2587 struct hlist_nulls_node *node;
2588 struct sock *sk = cur;
2589
2590 ++st->num;
2591 ++st->offset;
2592
2593 sk = sk_nulls_next(sk);
2594
2595 sk_nulls_for_each_from(sk, node) {
2596 if (seq_sk_match(seq, sk))
2597 return sk;
2598 }
2599
2600 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2601 ++st->bucket;
2602 return established_get_first(seq);
2603 }
2604
2605 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2606 {
2607 struct tcp_iter_state *st = seq->private;
2608 void *rc;
2609
2610 st->bucket = 0;
2611 rc = established_get_first(seq);
2612
2613 while (rc && pos) {
2614 rc = established_get_next(seq, rc);
2615 --pos;
2616 }
2617 return rc;
2618 }
2619
2620 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2621 {
2622 void *rc;
2623 struct tcp_iter_state *st = seq->private;
2624
2625 st->state = TCP_SEQ_STATE_LISTENING;
2626 rc = listening_get_idx(seq, &pos);
2627
2628 if (!rc) {
2629 st->state = TCP_SEQ_STATE_ESTABLISHED;
2630 rc = established_get_idx(seq, pos);
2631 }
2632
2633 return rc;
2634 }
2635
/* Try to resume the dump at the position remembered in the iterator state
 * (st->state, st->bucket, st->offset) instead of rescanning from the start.
 *
 * Returns the socket to continue with, or NULL when nothing could be found
 * from the saved position. st->num is restored to its value on entry, since
 * the *_get_next() helpers used for skipping increment it.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* Skip the saved number of entries, but only while still in
		 * the bucket we stopped in.
		 */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* Listening table exhausted: continue with the established one. */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2669
2670 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2671 {
2672 struct tcp_iter_state *st = seq->private;
2673 void *rc;
2674
2675 if (*pos && *pos == st->last_pos) {
2676 rc = tcp_seek_last_pos(seq);
2677 if (rc)
2678 goto out;
2679 }
2680
2681 st->state = TCP_SEQ_STATE_LISTENING;
2682 st->num = 0;
2683 st->bucket = 0;
2684 st->offset = 0;
2685 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2686
2687 out:
2688 st->last_pos = *pos;
2689 return rc;
2690 }
2691
2692 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2693 {
2694 struct tcp_iter_state *st = seq->private;
2695 void *rc = NULL;
2696
2697 if (v == SEQ_START_TOKEN) {
2698 rc = tcp_get_idx(seq, 0);
2699 goto out;
2700 }
2701
2702 switch (st->state) {
2703 case TCP_SEQ_STATE_LISTENING:
2704 rc = listening_get_next(seq, v);
2705 if (!rc) {
2706 st->state = TCP_SEQ_STATE_ESTABLISHED;
2707 st->bucket = 0;
2708 st->offset = 0;
2709 rc = established_get_first(seq);
2710 }
2711 break;
2712 case TCP_SEQ_STATE_ESTABLISHED:
2713 rc = established_get_next(seq, v);
2714 break;
2715 }
2716 out:
2717 ++*pos;
2718 st->last_pos = *pos;
2719 return rc;
2720 }
2721
2722 void tcp_seq_stop(struct seq_file *seq, void *v)
2723 {
2724 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2725 struct tcp_iter_state *st = seq->private;
2726
2727 switch (st->state) {
2728 case TCP_SEQ_STATE_LISTENING:
2729 if (v != SEQ_START_TOKEN)
2730 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2731 break;
2732 case TCP_SEQ_STATE_ESTABLISHED:
2733 if (v)
2734 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2735 break;
2736 }
2737 }
2738
2739 static void get_openreq4(const struct request_sock *req,
2740 struct seq_file *f, int i)
2741 {
2742 const struct inet_request_sock *ireq = inet_rsk(req);
2743 long delta = req->rsk_timer.expires - jiffies;
2744
2745 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2746 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2747 i,
2748 ireq->ir_loc_addr,
2749 ireq->ir_num,
2750 ireq->ir_rmt_addr,
2751 ntohs(ireq->ir_rmt_port),
2752 TCP_SYN_RECV,
2753 0, 0, /* could print option size, but that is af dependent. */
2754 1, /* timers active (only the expire timer) */
2755 jiffies_delta_to_clock_t(delta),
2756 req->num_timeout,
2757 from_kuid_munged(seq_user_ns(f),
2758 sk_uid(req->rsk_listener)),
2759 0, /* non standard timer */
2760 0, /* open_requests have no inode */
2761 0,
2762 req);
2763 }
2764
2765 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2766 {
2767 int timer_active;
2768 unsigned long timer_expires;
2769 const struct tcp_sock *tp = tcp_sk(sk);
2770 const struct inet_connection_sock *icsk = inet_csk(sk);
2771 const struct inet_sock *inet = inet_sk(sk);
2772 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2773 __be32 dest = inet->inet_daddr;
2774 __be32 src = inet->inet_rcv_saddr;
2775 __u16 destp = ntohs(inet->inet_dport);
2776 __u16 srcp = ntohs(inet->inet_sport);
2777 u8 icsk_pending;
2778 int rx_queue;
2779 int state;
2780
2781 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2782 if (icsk_pending == ICSK_TIME_RETRANS ||
2783 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2784 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2785 timer_active = 1;
2786 timer_expires = tcp_timeout_expires(sk);
2787 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2788 timer_active = 4;
2789 timer_expires = tcp_timeout_expires(sk);
2790 } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
2791 timer_active = 2;
2792 timer_expires = icsk->icsk_keepalive_timer.expires;
2793 } else {
2794 timer_active = 0;
2795 timer_expires = jiffies;
2796 }
2797
2798 state = inet_sk_state_load(sk);
2799 if (state == TCP_LISTEN)
2800 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2801 else
2802 /* Because we don't lock the socket,
2803 * we might find a transient negative value.
2804 */
2805 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2806 READ_ONCE(tp->copied_seq), 0);
2807
2808 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2809 "%08X %5u %8d %llu %d %pK %lu %lu %u %u %d",
2810 i, src, srcp, dest, destp, state,
2811 READ_ONCE(tp->write_seq) - tp->snd_una,
2812 rx_queue,
2813 timer_active,
2814 jiffies_delta_to_clock_t(timer_expires - jiffies),
2815 READ_ONCE(icsk->icsk_retransmits),
2816 from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2817 READ_ONCE(icsk->icsk_probes_out),
2818 sock_i_ino(sk),
2819 refcount_read(&sk->sk_refcnt), sk,
2820 jiffies_to_clock_t(icsk->icsk_rto),
2821 jiffies_to_clock_t(icsk->icsk_ack.ato),
2822 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2823 tcp_snd_cwnd(tp),
2824 state == TCP_LISTEN ?
2825 fastopenq->max_qlen :
2826 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2827 }
2828
2829 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2830 struct seq_file *f, int i)
2831 {
2832 long delta = tw->tw_timer.expires - jiffies;
2833 __be32 dest, src;
2834 __u16 destp, srcp;
2835
2836 dest = tw->tw_daddr;
2837 src = tw->tw_rcv_saddr;
2838 destp = ntohs(tw->tw_dport);
2839 srcp = ntohs(tw->tw_sport);
2840
2841 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2842 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2843 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2844 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2845 refcount_read(&tw->tw_refcnt), tw);
2846 }
2847
2848 #define TMPSZ 150
2849
2850 static int tcp4_seq_show(struct seq_file *seq, void *v)
2851 {
2852 struct tcp_iter_state *st;
2853 struct sock *sk = v;
2854
2855 seq_setwidth(seq, TMPSZ - 1);
2856 if (v == SEQ_START_TOKEN) {
2857 seq_puts(seq, " sl local_address rem_address st tx_queue "
2858 "rx_queue tr tm->when retrnsmt uid timeout "
2859 "inode");
2860 goto out;
2861 }
2862 st = seq->private;
2863
2864 if (sk->sk_state == TCP_TIME_WAIT)
2865 get_timewait4_sock(v, seq, st->num);
2866 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2867 get_openreq4(v, seq, st->num);
2868 else
2869 get_tcp4_sock(v, seq, st->num);
2870 out:
2871 seq_pad(seq, '\n');
2872 return 0;
2873 }
2874
2875 #ifdef CONFIG_BPF_SYSCALL
/* One slot of the BPF TCP iterator batch. A slot holds a referenced socket
 * pointer while the batch is live; bpf_iter_tcp_put_batch() replaces it with
 * the socket cookie when the reference is dropped, so the position can be
 * found again later.
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};
2880
/* Private state of the BPF TCP iterator: the regular TCP seq_file iterator
 * state followed by the current batch of sockets.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;
	unsigned int cur_sk;	/* index of the batch entry being shown */
	unsigned int end_sk;	/* number of filled entries in batch[] */
	unsigned int max_sk;	/* capacity of batch[] */
	union bpf_tcp_iter_batch_item *batch;
};
2888
/* Context passed to the BPF program by tcp_prog_seq_show(). */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2894
/* Build the iterator context for one socket and run @prog on it.
 *
 * @prog:      BPF program to run
 * @meta:      iterator metadata; its seq_num is decremented before the run
 * @sk_common: socket to expose (NULL when called from the stop callback)
 * @uid:       uid to expose alongside the socket
 *
 * Returns the value of bpf_iter_run_prog().
 */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct sock_common *sk_common, uid_t uid)
{
	struct bpf_iter__tcp ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.sk_common = sk_common;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
2906
2907 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2908 {
2909 union bpf_tcp_iter_batch_item *item;
2910 unsigned int cur_sk = iter->cur_sk;
2911 __u64 cookie;
2912
2913 /* Remember the cookies of the sockets we haven't seen yet, so we can
2914 * pick up where we left off next time around.
2915 */
2916 while (cur_sk < iter->end_sk) {
2917 item = &iter->batch[cur_sk++];
2918 cookie = sock_gen_cookie(item->sk);
2919 sock_gen_put(item->sk);
2920 item->cookie = cookie;
2921 }
2922 }
2923
2924 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2925 unsigned int new_batch_sz, gfp_t flags)
2926 {
2927 union bpf_tcp_iter_batch_item *new_batch;
2928
2929 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2930 flags | __GFP_NOWARN);
2931 if (!new_batch)
2932 return -ENOMEM;
2933
2934 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
2935 kvfree(iter->batch);
2936 iter->batch = new_batch;
2937 iter->max_sk = new_batch_sz;
2938
2939 return 0;
2940 }
2941
2942 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
2943 union bpf_tcp_iter_batch_item *cookies,
2944 int n_cookies)
2945 {
2946 struct hlist_nulls_node *node;
2947 struct sock *sk;
2948 int i;
2949
2950 for (i = 0; i < n_cookies; i++) {
2951 sk = first_sk;
2952 sk_nulls_for_each_from(sk, node)
2953 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
2954 return sk;
2955 }
2956
2957 return NULL;
2958 }
2959
2960 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
2961 {
2962 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2963 struct bpf_tcp_iter_state *iter = seq->private;
2964 struct tcp_iter_state *st = &iter->state;
2965 unsigned int find_cookie = iter->cur_sk;
2966 unsigned int end_cookie = iter->end_sk;
2967 int resume_bucket = st->bucket;
2968 struct sock *sk;
2969
2970 if (end_cookie && find_cookie == end_cookie)
2971 ++st->bucket;
2972
2973 sk = listening_get_first(seq);
2974 iter->cur_sk = 0;
2975 iter->end_sk = 0;
2976
2977 if (sk && st->bucket == resume_bucket && end_cookie) {
2978 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
2979 end_cookie - find_cookie);
2980 if (!sk) {
2981 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2982 ++st->bucket;
2983 sk = listening_get_first(seq);
2984 }
2985 }
2986
2987 return sk;
2988 }
2989
2990 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
2991 {
2992 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2993 struct bpf_tcp_iter_state *iter = seq->private;
2994 struct tcp_iter_state *st = &iter->state;
2995 unsigned int find_cookie = iter->cur_sk;
2996 unsigned int end_cookie = iter->end_sk;
2997 int resume_bucket = st->bucket;
2998 struct sock *sk;
2999
3000 if (end_cookie && find_cookie == end_cookie)
3001 ++st->bucket;
3002
3003 sk = established_get_first(seq);
3004 iter->cur_sk = 0;
3005 iter->end_sk = 0;
3006
3007 if (sk && st->bucket == resume_bucket && end_cookie) {
3008 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3009 end_cookie - find_cookie);
3010 if (!sk) {
3011 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3012 ++st->bucket;
3013 sk = established_get_first(seq);
3014 }
3015 }
3016
3017 return sk;
3018 }
3019
3020 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3021 {
3022 struct bpf_tcp_iter_state *iter = seq->private;
3023 struct tcp_iter_state *st = &iter->state;
3024 struct sock *sk = NULL;
3025
3026 switch (st->state) {
3027 case TCP_SEQ_STATE_LISTENING:
3028 sk = bpf_iter_tcp_resume_listening(seq);
3029 if (sk)
3030 break;
3031 st->bucket = 0;
3032 st->state = TCP_SEQ_STATE_ESTABLISHED;
3033 fallthrough;
3034 case TCP_SEQ_STATE_ESTABLISHED:
3035 sk = bpf_iter_tcp_resume_established(seq);
3036 break;
3037 }
3038
3039 return sk;
3040 }
3041
3042 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3043 struct sock **start_sk)
3044 {
3045 struct bpf_tcp_iter_state *iter = seq->private;
3046 struct hlist_nulls_node *node;
3047 unsigned int expected = 1;
3048 struct sock *sk;
3049
3050 sock_hold(*start_sk);
3051 iter->batch[iter->end_sk++].sk = *start_sk;
3052
3053 sk = sk_nulls_next(*start_sk);
3054 *start_sk = NULL;
3055 sk_nulls_for_each_from(sk, node) {
3056 if (seq_sk_match(seq, sk)) {
3057 if (iter->end_sk < iter->max_sk) {
3058 sock_hold(sk);
3059 iter->batch[iter->end_sk++].sk = sk;
3060 } else if (!*start_sk) {
3061 /* Remember where we left off. */
3062 *start_sk = sk;
3063 }
3064 expected++;
3065 }
3066 }
3067
3068 return expected;
3069 }
3070
3071 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3072 struct sock **start_sk)
3073 {
3074 struct bpf_tcp_iter_state *iter = seq->private;
3075 struct hlist_nulls_node *node;
3076 unsigned int expected = 1;
3077 struct sock *sk;
3078
3079 sock_hold(*start_sk);
3080 iter->batch[iter->end_sk++].sk = *start_sk;
3081
3082 sk = sk_nulls_next(*start_sk);
3083 *start_sk = NULL;
3084 sk_nulls_for_each_from(sk, node) {
3085 if (seq_sk_match(seq, sk)) {
3086 if (iter->end_sk < iter->max_sk) {
3087 sock_hold(sk);
3088 iter->batch[iter->end_sk++].sk = sk;
3089 } else if (!*start_sk) {
3090 /* Remember where we left off. */
3091 *start_sk = sk;
3092 }
3093 expected++;
3094 }
3095 }
3096
3097 return expected;
3098 }
3099
3100 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3101 struct sock **start_sk)
3102 {
3103 struct bpf_tcp_iter_state *iter = seq->private;
3104 struct tcp_iter_state *st = &iter->state;
3105
3106 if (st->state == TCP_SEQ_STATE_LISTENING)
3107 return bpf_iter_tcp_listening_batch(seq, start_sk);
3108 else
3109 return bpf_iter_tcp_established_batch(seq, start_sk);
3110 }
3111
3112 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3113 {
3114 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3115 struct bpf_tcp_iter_state *iter = seq->private;
3116 struct tcp_iter_state *st = &iter->state;
3117
3118 if (st->state == TCP_SEQ_STATE_LISTENING)
3119 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3120 else
3121 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3122 }
3123
/* Collect the next batch of sockets (one bucket's worth) for the BPF
 * iterator.
 *
 * Returns the first socket of the new batch, NULL when there is nothing
 * left to iterate, or an ERR_PTR() when the batch array could not be grown.
 * The bucket lock is not held on return.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	/* expected counts every match in the bucket; end_sk only those that
	 * fit in the batch. Equal values mean the whole bucket was captured.
	 */
	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	/* First retry: the lock is dropped, so a GFP_USER allocation with
	 * 50% headroom is used.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	/* sk is the first socket that did not fit in the previous attempt. */
	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3171
3172 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3173 {
3174 /* bpf iter does not support lseek, so it always
3175 * continue from where it was stop()-ped.
3176 */
3177 if (*pos)
3178 return bpf_iter_tcp_batch(seq);
3179
3180 return SEQ_START_TOKEN;
3181 }
3182
3183 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3184 {
3185 struct bpf_tcp_iter_state *iter = seq->private;
3186 struct tcp_iter_state *st = &iter->state;
3187 struct sock *sk;
3188
3189 /* Whenever seq_next() is called, the iter->cur_sk is
3190 * done with seq_show(), so advance to the next sk in
3191 * the batch.
3192 */
3193 if (iter->cur_sk < iter->end_sk) {
3194 /* Keeping st->num consistent in tcp_iter_state.
3195 * bpf_iter_tcp does not use st->num.
3196 * meta.seq_num is used instead.
3197 */
3198 st->num++;
3199 sock_gen_put(iter->batch[iter->cur_sk++].sk);
3200 }
3201
3202 if (iter->cur_sk < iter->end_sk)
3203 sk = iter->batch[iter->cur_sk].sk;
3204 else
3205 sk = bpf_iter_tcp_batch(seq);
3206
3207 ++*pos;
3208 /* Keeping st->last_pos consistent in tcp_iter_state.
3209 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3210 */
3211 st->last_pos = *pos;
3212 return sk;
3213 }
3214
3215 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3216 {
3217 struct bpf_iter_meta meta;
3218 struct bpf_prog *prog;
3219 struct sock *sk = v;
3220 uid_t uid;
3221 int ret;
3222
3223 if (v == SEQ_START_TOKEN)
3224 return 0;
3225
3226 if (sk_fullsock(sk))
3227 lock_sock(sk);
3228
3229 if (unlikely(sk_unhashed(sk))) {
3230 ret = SEQ_SKIP;
3231 goto unlock;
3232 }
3233
3234 if (sk->sk_state == TCP_TIME_WAIT) {
3235 uid = 0;
3236 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3237 const struct request_sock *req = v;
3238
3239 uid = from_kuid_munged(seq_user_ns(seq),
3240 sk_uid(req->rsk_listener));
3241 } else {
3242 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3243 }
3244
3245 meta.seq = seq;
3246 prog = bpf_iter_get_info(&meta, false);
3247 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3248
3249 unlock:
3250 if (sk_fullsock(sk))
3251 release_sock(sk);
3252 return ret;
3253
3254 }
3255
3256 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3257 {
3258 struct bpf_tcp_iter_state *iter = seq->private;
3259 struct bpf_iter_meta meta;
3260 struct bpf_prog *prog;
3261
3262 if (!v) {
3263 meta.seq = seq;
3264 prog = bpf_iter_get_info(&meta, true);
3265 if (prog)
3266 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3267 }
3268
3269 if (iter->cur_sk < iter->end_sk)
3270 bpf_iter_tcp_put_batch(iter);
3271 }
3272
3273 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3274 .show = bpf_iter_tcp_seq_show,
3275 .start = bpf_iter_tcp_seq_start,
3276 .next = bpf_iter_tcp_seq_next,
3277 .stop = bpf_iter_tcp_seq_stop,
3278 };
3279 #endif
3280 static unsigned short seq_file_family(const struct seq_file *seq)
3281 {
3282 const struct tcp_seq_afinfo *afinfo;
3283
3284 #ifdef CONFIG_BPF_SYSCALL
3285 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
3286 if (seq->op == &bpf_iter_tcp_seq_ops)
3287 return AF_UNSPEC;
3288 #endif
3289
3290 /* Iterated from proc fs */
3291 afinfo = pde_data(file_inode(seq->file));
3292 return afinfo->family;
3293 }
3294
3295 static const struct seq_operations tcp4_seq_ops = {
3296 .show = tcp4_seq_show,
3297 .start = tcp_seq_start,
3298 .next = tcp_seq_next,
3299 .stop = tcp_seq_stop,
3300 };
3301
3302 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3303 .family = AF_INET,
3304 };
3305
3306 static int __net_init tcp4_proc_init_net(struct net *net)
3307 {
3308 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3309 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3310 return -ENOMEM;
3311 return 0;
3312 }
3313
3314 static void __net_exit tcp4_proc_exit_net(struct net *net)
3315 {
3316 remove_proc_entry("tcp", net->proc_net);
3317 }
3318
3319 static struct pernet_operations tcp4_net_ops = {
3320 .init = tcp4_proc_init_net,
3321 .exit = tcp4_proc_exit_net,
3322 };
3323
3324 int __init tcp4_proc_init(void)
3325 {
3326 return register_pernet_subsys(&tcp4_net_ops);
3327 }
3328
3329 void tcp4_proc_exit(void)
3330 {
3331 unregister_pernet_subsys(&tcp4_net_ops);
3332 }
3333 #endif /* CONFIG_PROC_FS */
3334
3335 struct proto tcp_prot = {
3336 .name = "TCP",
3337 .owner = THIS_MODULE,
3338 .close = tcp_close,
3339 .pre_connect = tcp_v4_pre_connect,
3340 .connect = tcp_v4_connect,
3341 .disconnect = tcp_disconnect,
3342 .accept = inet_csk_accept,
3343 .ioctl = tcp_ioctl,
3344 .init = tcp_v4_init_sock,
3345 .destroy = tcp_v4_destroy_sock,
3346 .shutdown = tcp_shutdown,
3347 .setsockopt = tcp_setsockopt,
3348 .getsockopt = tcp_getsockopt,
3349 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3350 .keepalive = tcp_set_keepalive,
3351 .recvmsg = tcp_recvmsg,
3352 .sendmsg = tcp_sendmsg,
3353 .splice_eof = tcp_splice_eof,
3354 .backlog_rcv = tcp_v4_do_rcv,
3355 .release_cb = tcp_release_cb,
3356 .hash = inet_hash,
3357 .unhash = inet_unhash,
3358 .get_port = inet_csk_get_port,
3359 .put_port = inet_put_port,
3360 #ifdef CONFIG_BPF_SYSCALL
3361 .psock_update_sk_prot = tcp_bpf_update_proto,
3362 #endif
3363 .enter_memory_pressure = tcp_enter_memory_pressure,
3364 .leave_memory_pressure = tcp_leave_memory_pressure,
3365 .stream_memory_free = tcp_stream_memory_free,
3366 .sockets_allocated = &tcp_sockets_allocated,
3367
3368 .memory_allocated = &net_aligned_data.tcp_memory_allocated,
3369 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3370
3371 .memory_pressure = &tcp_memory_pressure,
3372 .sysctl_mem = sysctl_tcp_mem,
3373 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3374 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3375 .max_header = MAX_TCP_HEADER,
3376 .obj_size = sizeof(struct tcp_sock),
3377 .freeptr_offset = offsetof(struct tcp_sock,
3378 inet_conn.icsk_inet.sk.sk_freeptr),
3379 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3380 .twsk_prot = &tcp_timewait_sock_ops,
3381 .rsk_prot = &tcp_request_sock_ops,
3382 .h.hashinfo = NULL,
3383 .no_autobind = true,
3384 .diag_destroy = tcp_abort,
3385 };
3386 EXPORT_SYMBOL(tcp_prot);
3387
3388 static void __net_exit tcp_sk_exit(struct net *net)
3389 {
3390 if (net->ipv4.tcp_congestion_control)
3391 bpf_module_put(net->ipv4.tcp_congestion_control,
3392 net->ipv4.tcp_congestion_control->owner);
3393 }
3394
3395 static void __net_init tcp_set_hashinfo(struct net *net)
3396 {
3397 struct inet_hashinfo *hinfo;
3398 unsigned int ehash_entries;
3399 struct net *old_net;
3400
3401 if (net_eq(net, &init_net))
3402 goto fallback;
3403
3404 old_net = current->nsproxy->net_ns;
3405 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3406 if (!ehash_entries)
3407 goto fallback;
3408
3409 ehash_entries = roundup_pow_of_two(ehash_entries);
3410 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3411 if (!hinfo) {
3412 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3413 "for a netns, fallback to the global one\n",
3414 ehash_entries);
3415 fallback:
3416 hinfo = &tcp_hashinfo;
3417 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3418 }
3419
3420 net->ipv4.tcp_death_row.hashinfo = hinfo;
3421 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3422 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3423 }
3424
3425 static int __net_init tcp_sk_init(struct net *net)
3426 {
3427 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3428 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3429 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3430 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3431
3432 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3433 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3434 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3435 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3436 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3437
3438 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3439 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3440 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3441
3442 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3443 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3444 net->ipv4.sysctl_tcp_syncookies = 1;
3445 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3446 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3447 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3448 net->ipv4.sysctl_tcp_orphan_retries = 0;
3449 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3450 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3451 net->ipv4.sysctl_tcp_tw_reuse = 2;
3452 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3453 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3454
3455 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3456 tcp_set_hashinfo(net);
3457
3458 net->ipv4.sysctl_tcp_sack = 1;
3459 net->ipv4.sysctl_tcp_window_scaling = 1;
3460 net->ipv4.sysctl_tcp_timestamps = 1;
3461 net->ipv4.sysctl_tcp_early_retrans = 3;
3462 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3463 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3464 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3465 net->ipv4.sysctl_tcp_max_reordering = 300;
3466 net->ipv4.sysctl_tcp_dsack = 1;
3467 net->ipv4.sysctl_tcp_app_win = 31;
3468 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3469 net->ipv4.sysctl_tcp_frto = 2;
3470 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3471 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
3472 /* This limits the percentage of the congestion window which we
3473 * will allow a single TSO frame to consume. Building TSO frames
3474 * which are too large can cause TCP streams to be bursty.
3475 */
3476 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3477 /* Default TSQ limit of 4 MB */
3478 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3479
3480 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3481 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3482
3483 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3484 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3485 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3486 net->ipv4.sysctl_tcp_autocorking = 1;
3487 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3488 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3489 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3490 if (net != &init_net) {
3491 memcpy(net->ipv4.sysctl_tcp_rmem,
3492 init_net.ipv4.sysctl_tcp_rmem,
3493 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3494 memcpy(net->ipv4.sysctl_tcp_wmem,
3495 init_net.ipv4.sysctl_tcp_wmem,
3496 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3497 }
3498 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3499 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
3500 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3501 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
3502 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3503 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3504 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3505 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3506
3507 /* Set default values for PLB */
3508 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3509 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3510 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3511 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3512 /* Default congestion threshold for PLB to mark a round is 50% */
3513 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3514
3515 /* Reno is always built in */
3516 if (!net_eq(net, &init_net) &&
3517 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3518 init_net.ipv4.tcp_congestion_control->owner))
3519 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3520 else
3521 net->ipv4.tcp_congestion_control = &tcp_reno;
3522
3523 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3524 net->ipv4.sysctl_tcp_shrink_window = 0;
3525
3526 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3527 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3528 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3529
3530 return 0;
3531 }
3532
3533 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3534 {
3535 struct net *net;
3536
3537 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3538 * and failed setup_net error unwinding path are serialized.
3539 *
3540 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3541 * net_exit_list, the thread that dismantles a particular twsk must
3542 * do so without other thread progressing to refcount_dec_and_test() of
3543 * tcp_death_row.tw_refcount.
3544 */
3545 mutex_lock(&tcp_exit_batch_mutex);
3546
3547 tcp_twsk_purge(net_exit_list);
3548
3549 list_for_each_entry(net, net_exit_list, exit_list) {
3550 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3551 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3552 tcp_fastopen_ctx_destroy(net);
3553 }
3554
3555 mutex_unlock(&tcp_exit_batch_mutex);
3556 }
3557
3558 static struct pernet_operations __net_initdata tcp_sk_ops = {
3559 .init = tcp_sk_init,
3560 .exit = tcp_sk_exit,
3561 .exit_batch = tcp_sk_exit_batch,
3562 };
3563
3564 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3565 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3566 struct sock_common *sk_common, uid_t uid)
3567
3568 #define INIT_BATCH_SZ 16
3569
3570 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3571 {
3572 struct bpf_tcp_iter_state *iter = priv_data;
3573 int err;
3574
3575 err = bpf_iter_init_seq_net(priv_data, aux);
3576 if (err)
3577 return err;
3578
3579 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3580 if (err) {
3581 bpf_iter_fini_seq_net(priv_data);
3582 return err;
3583 }
3584
3585 return 0;
3586 }
3587
3588 static void bpf_iter_fini_tcp(void *priv_data)
3589 {
3590 struct bpf_tcp_iter_state *iter = priv_data;
3591
3592 bpf_iter_fini_seq_net(priv_data);
3593 kvfree(iter->batch);
3594 }
3595
3596 static const struct bpf_iter_seq_info tcp_seq_info = {
3597 .seq_ops = &bpf_iter_tcp_seq_ops,
3598 .init_seq_private = bpf_iter_init_tcp,
3599 .fini_seq_private = bpf_iter_fini_tcp,
3600 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3601 };
3602
3603 static const struct bpf_func_proto *
3604 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3605 const struct bpf_prog *prog)
3606 {
3607 switch (func_id) {
3608 case BPF_FUNC_setsockopt:
3609 return &bpf_sk_setsockopt_proto;
3610 case BPF_FUNC_getsockopt:
3611 return &bpf_sk_getsockopt_proto;
3612 default:
3613 return NULL;
3614 }
3615 }
3616
3617 static struct bpf_iter_reg tcp_reg_info = {
3618 .target = "tcp",
3619 .ctx_arg_info_size = 1,
3620 .ctx_arg_info = {
3621 { offsetof(struct bpf_iter__tcp, sk_common),
3622 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3623 },
3624 .get_func_proto = bpf_iter_tcp_get_func_proto,
3625 .seq_info = &tcp_seq_info,
3626 };
3627
3628 static void __init bpf_iter_register(void)
3629 {
3630 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3631 if (bpf_iter_reg_target(&tcp_reg_info))
3632 pr_warn("Warning: could not register bpf iterator tcp\n");
3633 }
3634
3635 #endif
3636
3637 void __init tcp_v4_init(void)
3638 {
3639 int cpu, res;
3640
3641 for_each_possible_cpu(cpu) {
3642 struct sock *sk;
3643
3644 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3645 IPPROTO_TCP, &init_net);
3646 if (res)
3647 panic("Failed to create the TCP control socket.\n");
3648 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3649
3650 /* Please enforce IP_DF and IPID==0 for RST and
3651 * ACK sent in SYN-RECV and TIME-WAIT state.
3652 */
3653 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3654
3655 sk->sk_clockid = CLOCK_MONOTONIC;
3656
3657 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3658 }
3659 if (register_pernet_subsys(&tcp_sk_ops))
3660 panic("Failed to create the TCP control socket.\n");
3661
3662 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3663 bpf_iter_register();
3664 #endif
3665 }
3666