1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89
90 #include <crypto/md5.h>
91 #include <crypto/utils.h>
92
93 #include <trace/events/tcp.h>
94
#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: the RST and ACK reply builders further down sign the
 * reply TCP header with this helper before its definition is reached.
 */
static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
				__be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

/* Per-CPU control socket used by tcp_v4_send_reset() and tcp_v4_send_ack() to
 * transmit replies. Both take .bh_lock (with bottom halves disabled) around
 * their use of .sock.
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* NOTE(review): no user of this mutex is visible in this part of the file;
 * the name suggests it serializes batched netns exit work - confirm.
 */
static DEFINE_MUTEX(tcp_exit_batch_mutex);
107
108 INDIRECT_CALLABLE_SCOPE union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net * net,const struct sk_buff * skb)109 tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
110 {
111 return secure_tcp_seq_and_ts_off(net,
112 ip_hdr(skb)->daddr,
113 ip_hdr(skb)->saddr,
114 tcp_hdr(skb)->dest,
115 tcp_hdr(skb)->source);
116 }
117
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 struct tcp_sock *tp = tcp_sk(sk);
124 int ts_recent_stamp;
125 u32 reuse_thresh;
126
127 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 reuse = 0;
129
130 if (reuse == 2) {
131 /* Still does not detect *everything* that goes through
132 * lo, since we require a loopback src or dst address
133 * or direct binding to 'lo' interface.
134 */
135 bool loopback = false;
136 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 if (tw->tw_family == AF_INET6) {
140 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 loopback = true;
145 } else
146 #endif
147 {
148 if (ipv4_is_loopback(tw->tw_daddr) ||
149 ipv4_is_loopback(tw->tw_rcv_saddr))
150 loopback = true;
151 }
152 if (!loopback)
153 reuse = 0;
154 }
155
156 /* With PAWS, it is safe from the viewpoint
157 of data integrity. Even without PAWS it is safe provided sequence
158 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159
160 Actually, the idea is close to VJ's one, only timestamp cache is
161 held not per host, but per port pair and TW bucket is used as state
162 holder.
163
164 If TW bucket has been already destroyed we fall back to VJ's scheme
165 and use initial timestamp retrieved from peer table.
166 */
167 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 if (ts_recent_stamp &&
171 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 * and releasing the bucket lock.
174 */
175 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 return 0;
177
178 /* In case of repair and re-using TIME-WAIT sockets we still
179 * want to be sure that it is safe as above but honor the
180 * sequence numbers and time stamps set as part of the repair
181 * process.
182 *
183 * Without this check re-using a TIME-WAIT socket with TCP
184 * repair would accumulate a -1 on the repair assigned
185 * sequence number. The first time it is reused the sequence
186 * is -1, the second time -2, etc. This fixes that issue
187 * without appearing to create any others.
188 */
189 if (likely(!tp->repair)) {
190 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191
192 if (!seq)
193 seq = 1;
194 WRITE_ONCE(tp->write_seq, seq);
195 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
196 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 }
198
199 return 1;
200 }
201
202 return 0;
203 }
204
tcp_v4_pre_connect(struct sock * sk,struct sockaddr_unsized * uaddr,int addr_len)205 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
206 int addr_len)
207 {
208 /* This check is replicated from tcp_v4_connect() and intended to
209 * prevent BPF program called below from accessing bytes that are out
210 * of the bound specified by user in addr_len.
211 */
212 if (addr_len < sizeof(struct sockaddr_in))
213 return -EINVAL;
214
215 sock_owned_by_me(sk);
216
217 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
218 }
219
/* This will initiate an outgoing connection.
 *
 * @sk:       TCP socket to connect; the caller is expected to hold the socket
 *            lock (the IP options are dereferenced under
 *            lockdep_sock_is_held()).
 * @uaddr:    destination, interpreted as a struct sockaddr_in.
 * @addr_len: length of @uaddr as supplied by the caller.
 *
 * Returns 0 on success or a negative errno. Errors detected before the
 * socket enters SYN-SENT are returned directly; later errors go through the
 * "failure" label, which moves the socket back to TCP_CLOSE and undoes the
 * source address / destination port setup. When
 * tcp_fastopen_defer_connect() reports a deferral, the value it stored in
 * err is returned as is.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source route option the route lookup targets the option's
	 * faddr instead of the final destination, which must then be set.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	/* Remember the ports used for the first lookup; they are handed to
	 * ip_route_newports() once the final source port is known.
	 */
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP cannot connect to a multicast or broadcast route. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	/* Without source routing, use the destination the lookup settled on. */
	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	/* No source address set yet: adopt the one chosen by routing. */
	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	/* Extension header length: PSP overhead plus any IP options. */
	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Redo the route lookup with the ports that are now in effect. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* rt holds an error pointer; clear it so the shared failure
		 * path does not try to release it.
		 */
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	/* NOTE(review): presumably sk_setup_caps() attaches the route to the
	 * socket, so the local handle is dropped to keep the failure path
	 * from releasing it - confirm.
	 */
	rt = NULL;

	if (likely(!tp->repair)) {
		union tcp_seq_and_ts_off st;

		st = secure_tcp_seq_and_ts_off(net,
					       inet->inet_saddr,
					       inet->inet_daddr,
					       inet->inet_sport,
					       usin->sin_port);
		/* Keep a write_seq that was already set (for instance the one
		 * seeded by tcp_twsk_unique()).
		 */
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq, st.seq);
		WRITE_ONCE(tp->tsoffset, st.ts_off);
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* A true return means the connect is deferred: hand back err as set
	 * by the helper without going through the failure path.
	 */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
364
365 /*
366 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
367 * It can be called through tcp_release_cb() if socket was owned by user
368 * at the time tcp_v4_err() was called to handle ICMP message.
369 */
tcp_v4_mtu_reduced(struct sock * sk)370 void tcp_v4_mtu_reduced(struct sock *sk)
371 {
372 struct inet_sock *inet = inet_sk(sk);
373 struct dst_entry *dst;
374 u32 mtu, dmtu;
375
376 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
377 return;
378 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
379 dst = inet_csk_update_pmtu(sk, mtu);
380 if (!dst)
381 return;
382
383 /* Something is about to be wrong... Remember soft error
384 * for the case, if this connection will not able to recover.
385 */
386 dmtu = dst4_mtu(dst);
387 if (mtu < dmtu && ip_dont_fragment(sk, dst))
388 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
389
390 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
391 ip_sk_accept_pmtu(sk) &&
392 inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
393 tcp_sync_mss(sk, dmtu);
394
395 /* Resend the TCP packet because it's
396 * clear that the old packet has been
397 * dropped. This is the new "fast" path mtu
398 * discovery.
399 */
400 tcp_simple_retransmit(sk);
401 } /* else let the usual retransmit timer handle it */
402 }
403
do_redirect(struct sk_buff * skb,struct sock * sk)404 static void do_redirect(struct sk_buff *skb, struct sock *sk)
405 {
406 struct dst_entry *dst = __sk_dst_check(sk, 0);
407
408 if (dst)
409 dst->ops->redirect(dst, sk, skb);
410 }
411
412
413 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
tcp_req_err(struct sock * sk,u32 seq,bool abort)414 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
415 {
416 struct request_sock *req = inet_reqsk(sk);
417 struct net *net = sock_net(sk);
418
419 /* ICMPs are not backlogged, hence we cannot get
420 * an established socket here.
421 */
422 if (seq != tcp_rsk(req)->snt_isn) {
423 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
424 } else if (abort) {
425 /*
426 * Still in SYN_RECV, just remove it silently.
427 * There is no good way to pass the error to the newly
428 * created socket, and POSIX does not want network
429 * errors returned from accept().
430 */
431 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
432 tcp_listendrop(req->rsk_listener);
433 }
434 reqsk_put(req);
435 }
436
437 /* TCP-LD (RFC 6069) logic */
tcp_ld_RTO_revert(struct sock * sk,u32 seq)438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439 {
440 struct inet_connection_sock *icsk = inet_csk(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 struct sk_buff *skb;
443 s32 remaining;
444 u32 delta_us;
445
446 if (sock_owned_by_user(sk))
447 return;
448
449 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
450 !icsk->icsk_backoff)
451 return;
452
453 skb = tcp_rtx_queue_head(sk);
454 if (WARN_ON_ONCE(!skb))
455 return;
456
457 icsk->icsk_backoff--;
458 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
460
461 tcp_mstamp_refresh(tp);
462 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464
465 if (remaining > 0) {
466 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
467 } else {
468 /* RTO revert clocked out retransmission.
469 * Will retransmit now.
470 */
471 tcp_retransmit_timer(sk);
472 }
473 }
474
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition. If err < 0 then the socket should
 * be closed and the error returned to the user. If err > 0
 * it's just the icmp type << 8 | icmp code. After adjustment
 * header points to the first 8 bytes of the tcp header. We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  ICMP error; skb->data points at the IP header of the offending
 *        packet quoted in the ICMP payload.
 * @info: extra ICMP information; stored as the new MTU for
 *        ICMP_FRAG_NEEDED and passed on to ip_icmp_error().
 *
 * Returns -ENOENT when no matching socket is found, 0 otherwise.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	/* The quoted packet is one we sent, so its destination is the
	 * remote end and its source is the local end.
	 */
	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* tcp_req_err() drops the reference on the request socket;
		 * the last argument tells it whether the error aborts the
		 * request.
		 */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Except on listeners, ignore errors quoting a sequence number
	 * outside [snd_una, snd_nxt].
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	/* Map the ICMP type/code to an errno, or handle it right here. */
	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				/* Socket is owned by the user: flag the work as
				 * deferred, taking a socket reference only if
				 * the flag was not already set.
				 */
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		/* Fail the connection attempt now unless the user owns the
		 * socket, in which case only a soft error is recorded.
		 */
		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	/* Common exit for every path that took bh_lock_sock(). */
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
657
/* Number of __be32 words needed to hold MAX_TCP_OPTION_SPACE bytes of TCP
 * options; used to size the option area of a reply segment.
 */
#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
659
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])660 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
661 const struct tcp_ao_hdr *aoh,
662 struct ip_reply_arg *arg, struct tcphdr *reply,
663 __be32 reply_options[REPLY_OPTIONS_LEN])
664 {
665 #ifdef CONFIG_TCP_AO
666 int sdif = tcp_v4_sdif(skb);
667 int dif = inet_iif(skb);
668 int l3index = sdif ? dif : 0;
669 bool allocated_traffic_key;
670 struct tcp_ao_key *key;
671 char *traffic_key;
672 bool drop = true;
673 u32 ao_sne = 0;
674 u8 keyid;
675
676 rcu_read_lock();
677 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
678 &key, &traffic_key, &allocated_traffic_key,
679 &keyid, &ao_sne))
680 goto out;
681
682 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
683 (aoh->rnext_keyid << 8) | keyid);
684 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
685 reply->doff = arg->iov[0].iov_len / 4;
686
687 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
688 key, traffic_key,
689 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
690 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
691 reply, ao_sne))
692 goto out;
693 drop = false;
694 out:
695 rcu_read_unlock();
696 if (allocated_traffic_key)
697 kfree(traffic_key);
698 return drop;
699 #else
700 return true;
701 #endif
702 }
703
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 *
 *	@sk:     socket the segment was matched to; may be NULL, a full socket
 *		 or a TIME-WAIT socket.
 *	@skb:    segment being answered.
 *	@reason: reset reason, only passed to the tcp_send_reset tracepoint.
 *
 *	No reset is sent when @skb itself carries RST, when there is no
 *	socket and the route is not local, when the authentication options
 *	of @skb are malformed, or when the required TCP-AO / MD5 signing
 *	cannot be done.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	/* If the segment carried an ACK, the reset takes its sequence number
	 * from that ACK. Otherwise the reset acknowledges everything the
	 * segment occupied: SYN, FIN and payload bytes.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	/* A true return from the AO signer means the reset must be dropped. */
	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	/* Held until the "out" label at the end of the function. */
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find a listening socket
		 * through the source port, and then find the md5 key through
		 * that listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key found, and no RST is generated if the hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		/* Recompute the hash of the incoming segment and compare it
		 * with the one it carried.
		 */
		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (crypto_memneq(md5_hash_location, newhash, 16))
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	/* Pseudo-header checksum seed for the reply (addresses swapped). */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	/* sk may be a TIME-WAIT socket: the sk_bound_dev_if read above is
	 * only valid for it because both fields sit at the same offset.
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* Borrow this CPU's control socket for the transmission. */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		/* Mark, priority and txhash live in different structures for
		 * TIME-WAIT sockets and for other sockets.
		 */
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Undo the per-reply setup of the shared control socket. */
	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
904
905 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
906 outside socket context is ugly, certainly. What can I do?
907 */
908
tcp_v4_send_ack(const struct sock * sk,struct sk_buff * skb,u32 seq,u32 ack,u32 win,u32 tsval,u32 tsecr,int oif,struct tcp_key * key,int reply_flags,u8 tos,u32 txhash)909 static void tcp_v4_send_ack(const struct sock *sk,
910 struct sk_buff *skb, u32 seq, u32 ack,
911 u32 win, u32 tsval, u32 tsecr, int oif,
912 struct tcp_key *key,
913 int reply_flags, u8 tos, u32 txhash)
914 {
915 const struct tcphdr *th = tcp_hdr(skb);
916 struct {
917 struct tcphdr th;
918 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
919 } rep;
920 struct net *net = sock_net(sk);
921 struct ip_reply_arg arg;
922 struct sock *ctl_sk;
923 u64 transmit_time;
924
925 memset(&rep.th, 0, sizeof(struct tcphdr));
926 memset(&arg, 0, sizeof(arg));
927
928 arg.iov[0].iov_base = (unsigned char *)&rep;
929 arg.iov[0].iov_len = sizeof(rep.th);
930 if (tsecr) {
931 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
932 (TCPOPT_TIMESTAMP << 8) |
933 TCPOLEN_TIMESTAMP);
934 rep.opt[1] = htonl(tsval);
935 rep.opt[2] = htonl(tsecr);
936 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
937 }
938
939 /* Swap the send and the receive. */
940 rep.th.dest = th->source;
941 rep.th.source = th->dest;
942 rep.th.doff = arg.iov[0].iov_len / 4;
943 rep.th.seq = htonl(seq);
944 rep.th.ack_seq = htonl(ack);
945 rep.th.ack = 1;
946 rep.th.window = htons(win);
947
948 #ifdef CONFIG_TCP_MD5SIG
949 if (tcp_key_is_md5(key)) {
950 int offset = (tsecr) ? 3 : 0;
951
952 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
953 (TCPOPT_NOP << 16) |
954 (TCPOPT_MD5SIG << 8) |
955 TCPOLEN_MD5SIG);
956 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
957 rep.th.doff = arg.iov[0].iov_len/4;
958
959 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
960 key->md5_key, ip_hdr(skb)->saddr,
961 ip_hdr(skb)->daddr, &rep.th);
962 }
963 #endif
964 #ifdef CONFIG_TCP_AO
965 if (tcp_key_is_ao(key)) {
966 int offset = (tsecr) ? 3 : 0;
967
968 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
969 (tcp_ao_len(key->ao_key) << 16) |
970 (key->ao_key->sndid << 8) |
971 key->rcv_next);
972 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
973 rep.th.doff = arg.iov[0].iov_len / 4;
974
975 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
976 key->ao_key, key->traffic_key,
977 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
978 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
979 &rep.th, key->sne);
980 }
981 #endif
982 arg.flags = reply_flags;
983 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
984 ip_hdr(skb)->saddr, /* XXX */
985 arg.iov[0].iov_len, IPPROTO_TCP, 0);
986 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
987 if (oif)
988 arg.bound_dev_if = oif;
989 arg.tos = tos;
990 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
991 local_bh_disable();
992 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
993 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
994 sock_net_set(ctl_sk, net);
995 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
996 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
997 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
998 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
999 transmit_time = tcp_transmit_time(sk);
1000 ip_send_unicast_reply(ctl_sk, sk,
1001 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1002 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1003 &arg, arg.iov[0].iov_len,
1004 transmit_time, txhash);
1005
1006 sock_net_set(ctl_sk, &init_net);
1007 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1008 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1009 local_bh_enable();
1010 }
1011
/* Send an ACK on behalf of the TIME-WAIT socket @sk in reply to @skb.
 *
 * @tw_status selects the ECN handling of the reply (see below). The reply is
 * signed with the socket's TCP-AO or MD5 key when one applies.
 *
 * Drops one reference on the TIME-WAIT socket on every return path.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed authentication options: send nothing. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* key.ao_key is only set inside the ao_info branch above, so ao_info
	 * is valid whenever this block runs.
	 */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Without TCP-AO this dummy branch keeps the "} else if" below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* The window is scaled down by the receive window scale kept in the
	 * TIME-WAIT socket before being put in the header.
	 */
	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1075
1076 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1077 struct request_sock *req)
1078 {
1079 struct tcp_key key = {};
1080
1081 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1082 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1083 */
1084 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1085 tcp_sk(sk)->snd_nxt;
1086
1087 #ifdef CONFIG_TCP_AO
1088 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1089 tcp_rsk_used_ao(req)) {
1090 const union tcp_md5_addr *addr;
1091 const struct tcp_ao_hdr *aoh;
1092 int l3index;
1093
1094 /* Invalid TCP option size or twice included auth */
1095 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1096 return;
1097 if (!aoh)
1098 return;
1099
1100 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1101 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1102 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1103 aoh->rnext_keyid, -1);
1104 if (unlikely(!key.ao_key)) {
1105 /* Send ACK with any matching MKT for the peer */
1106 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1107 /* Matching key disappeared (user removed the key?)
1108 * let the handshake timeout.
1109 */
1110 if (!key.ao_key) {
1111 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1112 addr,
1113 ntohs(tcp_hdr(skb)->source),
1114 &ip_hdr(skb)->daddr,
1115 ntohs(tcp_hdr(skb)->dest));
1116 return;
1117 }
1118 }
1119 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1120 if (!key.traffic_key)
1121 return;
1122
1123 key.type = TCP_KEY_AO;
1124 key.rcv_next = aoh->keyid;
1125 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1126 #else
1127 if (0) {
1128 #endif
1129 } else if (static_branch_tcp_md5()) {
1130 const union tcp_md5_addr *addr;
1131 int l3index;
1132
1133 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1134 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1135 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1136 if (key.md5_key)
1137 key.type = TCP_KEY_MD5;
1138 }
1139
1140 /* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
1141 tcp_v4_send_ack(sk, skb, seq,
1142 tcp_rsk(req)->rcv_nxt,
1143 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1144 tcp_rsk_tsval(tcp_rsk(req)),
1145 req->ts_recent,
1146 0, &key,
1147 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1148 ip_hdr(skb)->tos & ~INET_ECN_MASK,
1149 READ_ONCE(tcp_rsk(req)->txhash));
1150 if (tcp_key_is_ao(&key))
1151 kfree(key.traffic_key);
1152 }
1153
1154 /*
1155 * Send a SYN-ACK after having received a SYN.
1156 * This still operates on a request_sock only, not on a big
1157 * socket.
1158 */
1159 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1160 struct flowi *fl,
1161 struct request_sock *req,
1162 struct tcp_fastopen_cookie *foc,
1163 enum tcp_synack_type synack_type,
1164 struct sk_buff *syn_skb)
1165 {
1166 struct inet_request_sock *ireq = inet_rsk(req);
1167 struct flowi4 fl4;
1168 int err = -1;
1169 struct sk_buff *skb;
1170 u8 tos;
1171
1172 /* First, grab a route. */
1173 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1174 return -1;
1175
1176 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1177
1178 if (skb) {
1179 tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1180 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1181
1182 tos = READ_ONCE(inet_sk(sk)->tos);
1183
1184 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1185 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1186 (tos & INET_ECN_MASK);
1187
1188 if (!INET_ECN_is_capable(tos) &&
1189 tcp_bpf_ca_needs_ecn((struct sock *)req))
1190 tos |= INET_ECN_ECT_0;
1191
1192 rcu_read_lock();
1193 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1194 ireq->ir_rmt_addr,
1195 rcu_dereference(ireq->ireq_opt),
1196 tos);
1197 rcu_read_unlock();
1198 err = net_xmit_eval(err);
1199 }
1200
1201 return err;
1202 }
1203
1204 /*
1205 * IPv4 request_sock destructor.
1206 */
1207 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1208 {
1209 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1210 }
1211
1212 #ifdef CONFIG_TCP_MD5SIG
1213 /*
1214 * RFC2385 MD5 checksumming requires a mapping of
1215 * IP address->MD5 Key.
1216 * We need to maintain these in the sk structure.
1217 */
1218
/* Deferred static key, initially false, declared with HZ as its second
 * argument.  tcp_md5_do_add() and tcp_md5_key_copy() in this file take a
 * reference on it after allocating a socket's md5sig_info.
 */
1219 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1220
1221 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1222 {
1223 if (!old)
1224 return true;
1225
1226 /* l3index always overrides non-l3index */
1227 if (old->l3index && new->l3index == 0)
1228 return false;
1229 if (old->l3index == 0 && new->l3index)
1230 return true;
1231
1232 return old->prefixlen < new->prefixlen;
1233 }
1234
1235 /* Find the Key structure for an address. */
1236 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1237 const union tcp_md5_addr *addr,
1238 int family, bool any_l3index)
1239 {
1240 const struct tcp_sock *tp = tcp_sk(sk);
1241 struct tcp_md5sig_key *key;
1242 const struct tcp_md5sig_info *md5sig;
1243 __be32 mask;
1244 struct tcp_md5sig_key *best_match = NULL;
1245 bool match;
1246
1247 /* caller either holds rcu_read_lock() or socket lock */
1248 md5sig = rcu_dereference_check(tp->md5sig_info,
1249 lockdep_sock_is_held(sk));
1250 if (!md5sig)
1251 return NULL;
1252
1253 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1254 lockdep_sock_is_held(sk)) {
1255 if (key->family != family)
1256 continue;
1257 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1258 key->l3index != l3index)
1259 continue;
1260 if (family == AF_INET) {
1261 mask = inet_make_mask(key->prefixlen);
1262 match = (key->addr.a4.s_addr & mask) ==
1263 (addr->a4.s_addr & mask);
1264 #if IS_ENABLED(CONFIG_IPV6)
1265 } else if (family == AF_INET6) {
1266 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1267 key->prefixlen);
1268 #endif
1269 } else {
1270 match = false;
1271 }
1272
1273 if (match && better_md5_match(best_match, key))
1274 best_match = key;
1275 }
1276 return best_match;
1277 }
1278
1279 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1280 const union tcp_md5_addr *addr,
1281 int family, u8 prefixlen,
1282 int l3index, u8 flags)
1283 {
1284 const struct tcp_sock *tp = tcp_sk(sk);
1285 struct tcp_md5sig_key *key;
1286 unsigned int size = sizeof(struct in_addr);
1287 const struct tcp_md5sig_info *md5sig;
1288
1289 /* caller either holds rcu_read_lock() or socket lock */
1290 md5sig = rcu_dereference_check(tp->md5sig_info,
1291 lockdep_sock_is_held(sk));
1292 if (!md5sig)
1293 return NULL;
1294 #if IS_ENABLED(CONFIG_IPV6)
1295 if (family == AF_INET6)
1296 size = sizeof(struct in6_addr);
1297 #endif
1298 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1299 lockdep_sock_is_held(sk)) {
1300 if (key->family != family)
1301 continue;
1302 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1303 continue;
1304 if (key->l3index != l3index)
1305 continue;
1306 if (!memcmp(&key->addr, addr, size) &&
1307 key->prefixlen == prefixlen)
1308 return key;
1309 }
1310 return NULL;
1311 }
1312
1313 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1314 const struct sock *addr_sk)
1315 {
1316 const union tcp_md5_addr *addr;
1317 int l3index;
1318
1319 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1320 addr_sk->sk_bound_dev_if);
1321 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1322 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1323 }
1324
1325 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1326 {
1327 struct tcp_sock *tp = tcp_sk(sk);
1328 struct tcp_md5sig_info *md5sig;
1329
1330 md5sig = kmalloc_obj(*md5sig, gfp);
1331 if (!md5sig)
1332 return -ENOMEM;
1333
1334 sk_gso_disable(sk);
1335 INIT_HLIST_HEAD(&md5sig->head);
1336 rcu_assign_pointer(tp->md5sig_info, md5sig);
1337 return 0;
1338 }
1339
1340 /* This can be called on a newly created socket, from other files */
1341 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1342 int family, u8 prefixlen, int l3index, u8 flags,
1343 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1344 {
1345 /* Add Key to the list */
1346 struct tcp_md5sig_key *key;
1347 struct tcp_sock *tp = tcp_sk(sk);
1348 struct tcp_md5sig_info *md5sig;
1349
1350 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1351 if (key) {
1352 /* Pre-existing entry - just update that one.
1353 * Note that the key might be used concurrently.
1354 * data_race() is telling kcsan that we do not care of
1355 * key mismatches, since changing MD5 key on live flows
1356 * can lead to packet drops.
1357 */
1358 data_race(memcpy(key->key, newkey, newkeylen));
1359
1360 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1361 * Also note that a reader could catch new key->keylen value
1362 * but old key->key[], this is the reason we use __GFP_ZERO
1363 * at sock_kmalloc() time below these lines.
1364 */
1365 WRITE_ONCE(key->keylen, newkeylen);
1366
1367 return 0;
1368 }
1369
1370 md5sig = rcu_dereference_protected(tp->md5sig_info,
1371 lockdep_sock_is_held(sk));
1372
1373 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1374 if (!key)
1375 return -ENOMEM;
1376
1377 memcpy(key->key, newkey, newkeylen);
1378 key->keylen = newkeylen;
1379 key->family = family;
1380 key->prefixlen = prefixlen;
1381 key->l3index = l3index;
1382 key->flags = flags;
1383 memcpy(&key->addr, addr,
1384 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1385 sizeof(struct in_addr));
1386 hlist_add_head_rcu(&key->node, &md5sig->head);
1387 return 0;
1388 }
1389
1390 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1391 int family, u8 prefixlen, int l3index, u8 flags,
1392 const u8 *newkey, u8 newkeylen)
1393 {
1394 struct tcp_sock *tp = tcp_sk(sk);
1395
1396 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1397 if (fips_enabled) {
1398 pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
1399 return -EOPNOTSUPP;
1400 }
1401
1402 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1403 return -ENOMEM;
1404
1405 if (!static_branch_inc(&tcp_md5_needed.key)) {
1406 struct tcp_md5sig_info *md5sig;
1407
1408 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1409 rcu_assign_pointer(tp->md5sig_info, NULL);
1410 kfree_rcu(md5sig, rcu);
1411 return -EUSERS;
1412 }
1413 }
1414
1415 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1416 newkey, newkeylen, GFP_KERNEL);
1417 }
1418
1419 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1420 int family, u8 prefixlen, int l3index,
1421 struct tcp_md5sig_key *key)
1422 {
1423 struct tcp_sock *tp = tcp_sk(sk);
1424
1425 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1426
1427 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1428 return -ENOMEM;
1429
1430 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1431 struct tcp_md5sig_info *md5sig;
1432
1433 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1434 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1435 rcu_assign_pointer(tp->md5sig_info, NULL);
1436 kfree_rcu(md5sig, rcu);
1437 return -EUSERS;
1438 }
1439 }
1440
1441 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1442 key->flags, key->key, key->keylen,
1443 sk_gfp_mask(sk, GFP_ATOMIC));
1444 }
1445
1446 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1447 u8 prefixlen, int l3index, u8 flags)
1448 {
1449 struct tcp_md5sig_key *key;
1450
1451 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1452 if (!key)
1453 return -ENOENT;
1454 hlist_del_rcu(&key->node);
1455 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1456 kfree_rcu(key, rcu);
1457 return 0;
1458 }
1459
1460 void tcp_clear_md5_list(struct sock *sk)
1461 {
1462 struct tcp_sock *tp = tcp_sk(sk);
1463 struct tcp_md5sig_key *key;
1464 struct hlist_node *n;
1465 struct tcp_md5sig_info *md5sig;
1466
1467 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1468
1469 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1470 hlist_del(&key->node);
1471 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1472 kfree(key);
1473 }
1474 }
1475
1476 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1477 sockptr_t optval, int optlen)
1478 {
1479 struct tcp_md5sig cmd;
1480 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1481 const union tcp_md5_addr *addr;
1482 u8 prefixlen = 32;
1483 int l3index = 0;
1484 bool l3flag;
1485 u8 flags;
1486
1487 if (optlen < sizeof(cmd))
1488 return -EINVAL;
1489
1490 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1491 return -EFAULT;
1492
1493 if (sin->sin_family != AF_INET)
1494 return -EINVAL;
1495
1496 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1497 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1498
1499 if (optname == TCP_MD5SIG_EXT &&
1500 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1501 prefixlen = cmd.tcpm_prefixlen;
1502 if (prefixlen > 32)
1503 return -EINVAL;
1504 }
1505
1506 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1507 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1508 struct net_device *dev;
1509
1510 rcu_read_lock();
1511 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1512 if (dev && netif_is_l3_master(dev))
1513 l3index = dev->ifindex;
1514
1515 rcu_read_unlock();
1516
1517 /* ok to reference set/not set outside of rcu;
1518 * right now device MUST be an L3 master
1519 */
1520 if (!dev || !l3index)
1521 return -EINVAL;
1522 }
1523
1524 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1525
1526 if (!cmd.tcpm_keylen)
1527 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1528
1529 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1530 return -EINVAL;
1531
1532 /* Don't allow keys for peers that have a matching TCP-AO key.
1533 * See the comment in tcp_ao_add_cmd()
1534 */
1535 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1536 return -EKEYREJECTED;
1537
1538 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1539 cmd.tcpm_key, cmd.tcpm_keylen);
1540 }
1541
1542 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1543 __be32 daddr, __be32 saddr,
1544 const struct tcphdr *th, int nbytes)
1545 {
1546 struct {
1547 struct tcp4_pseudohdr ip;
1548 struct tcphdr tcp;
1549 } h;
1550
1551 h.ip.saddr = saddr;
1552 h.ip.daddr = daddr;
1553 h.ip.pad = 0;
1554 h.ip.protocol = IPPROTO_TCP;
1555 h.ip.len = cpu_to_be16(nbytes);
1556 h.tcp = *th;
1557 h.tcp.check = 0;
1558 md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1559 }
1560
1561 static noinline_for_stack void
1562 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1563 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1564 {
1565 struct md5_ctx ctx;
1566
1567 md5_init(&ctx);
1568 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1569 tcp_md5_hash_key(&ctx, key);
1570 md5_final(&ctx, md5_hash);
1571 }
1572
1573 noinline_for_stack void
1574 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1575 const struct sock *sk, const struct sk_buff *skb)
1576 {
1577 const struct tcphdr *th = tcp_hdr(skb);
1578 __be32 saddr, daddr;
1579 struct md5_ctx ctx;
1580
1581 if (sk) { /* valid for establish/request sockets */
1582 saddr = sk->sk_rcv_saddr;
1583 daddr = sk->sk_daddr;
1584 } else {
1585 const struct iphdr *iph = ip_hdr(skb);
1586 saddr = iph->saddr;
1587 daddr = iph->daddr;
1588 }
1589
1590 md5_init(&ctx);
1591 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1592 tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1593 tcp_md5_hash_key(&ctx, key);
1594 md5_final(&ctx, md5_hash);
1595 }
1596
1597 #endif
1598
1599 static void tcp_v4_init_req(struct request_sock *req,
1600 const struct sock *sk_listener,
1601 struct sk_buff *skb)
1602 {
1603 struct inet_request_sock *ireq = inet_rsk(req);
1604 struct net *net = sock_net(sk_listener);
1605
1606 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1607 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1608 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1609 }
1610
1611 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1612 struct sk_buff *skb,
1613 struct flowi *fl,
1614 struct request_sock *req,
1615 u32 tw_isn)
1616 {
1617 tcp_v4_init_req(req, sk, skb);
1618
1619 if (security_inet_conn_request(sk, skb, req))
1620 return NULL;
1621
1622 return inet_csk_route_req(sk, &fl->u.ip4, req);
1623 }
1624
1625 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1626 .family = PF_INET,
1627 .obj_size = sizeof(struct tcp_request_sock),
1628 .send_ack = tcp_v4_reqsk_send_ack,
1629 .destructor = tcp_v4_reqsk_destructor,
1630 .send_reset = tcp_v4_send_reset,
1631 };
1632
1633 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1634 .mss_clamp = TCP_MSS_DEFAULT,
1635 #ifdef CONFIG_TCP_MD5SIG
1636 .req_md5_lookup = tcp_v4_md5_lookup,
1637 .calc_md5_hash = tcp_v4_md5_hash_skb,
1638 #endif
1639 #ifdef CONFIG_TCP_AO
1640 .ao_lookup = tcp_v4_ao_lookup_rsk,
1641 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1642 .ao_synack_hash = tcp_v4_ao_synack_hash,
1643 #endif
1644 #ifdef CONFIG_SYN_COOKIES
1645 .cookie_init_seq = cookie_v4_init_sequence,
1646 #endif
1647 .route_req = tcp_v4_route_req,
1648 .init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off,
1649 .send_synack = tcp_v4_send_synack,
1650 };
1651
1652 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1653 {
1654 /* Never answer to SYNs send to broadcast or multicast */
1655 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1656 goto drop;
1657
1658 return tcp_conn_request(&tcp_request_sock_ops,
1659 &tcp_request_sock_ipv4_ops, sk, skb);
1660
1661 drop:
1662 tcp_listendrop(sk);
1663 return 0;
1664 }
1665
1666
1667 /*
1668 * The three way handshake has completed - we got a valid synack -
1669 * now create the new socket.
1670 */
1671 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1672 struct request_sock *req,
1673 struct dst_entry *dst,
1674 struct request_sock *req_unhash,
1675 bool *own_req,
1676 void (*opt_child_init)(struct sock *newsk,
1677 const struct sock *sk))
1678 {
1679 struct inet_request_sock *ireq;
1680 bool found_dup_sk = false;
1681 struct inet_sock *newinet;
1682 struct tcp_sock *newtp;
1683 struct sock *newsk;
1684 #ifdef CONFIG_TCP_MD5SIG
1685 const union tcp_md5_addr *addr;
1686 struct tcp_md5sig_key *key;
1687 int l3index;
1688 #endif
1689 struct ip_options_rcu *inet_opt;
1690
1691 if (sk_acceptq_is_full(sk))
1692 goto exit_overflow;
1693
1694 newsk = tcp_create_openreq_child(sk, req, skb);
1695 if (!newsk)
1696 goto exit_nonewsk;
1697
1698 newsk->sk_gso_type = SKB_GSO_TCPV4;
1699 inet_sk_rx_dst_set(newsk, skb);
1700
1701 newtp = tcp_sk(newsk);
1702 newinet = inet_sk(newsk);
1703 ireq = inet_rsk(req);
1704 inet_opt = rcu_dereference(ireq->ireq_opt);
1705 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1706 newinet->mc_index = inet_iif(skb);
1707 newinet->mc_ttl = ip_hdr(skb)->ttl;
1708 newinet->rcv_tos = ip_hdr(skb)->tos;
1709 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1710 if (inet_opt)
1711 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1712 atomic_set(&newinet->inet_id, get_random_u16());
1713
1714 /* Set ToS of the new socket based upon the value of incoming SYN.
1715 * ECT bits are set later in tcp_init_transfer().
1716 */
1717 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1718 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1719
1720 if (!dst) {
1721 dst = inet_csk_route_child_sock(sk, newsk, req);
1722 if (!dst)
1723 goto put_and_exit;
1724 } else {
1725 /* syncookie case : see end of cookie_v4_check() */
1726 }
1727 sk_setup_caps(newsk, dst);
1728
1729 #if IS_ENABLED(CONFIG_IPV6)
1730 if (opt_child_init)
1731 opt_child_init(newsk, sk);
1732 #endif
1733 tcp_ca_openreq_child(newsk, dst);
1734
1735 tcp_sync_mss(newsk, dst4_mtu(dst));
1736 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1737
1738 tcp_initialize_rcv_mss(newsk);
1739
1740 #ifdef CONFIG_TCP_MD5SIG
1741 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1742 /* Copy over the MD5 key from the original socket */
1743 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1744 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1745 if (key && !tcp_rsk_used_ao(req)) {
1746 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1747 goto put_and_exit;
1748 sk_gso_disable(newsk);
1749 }
1750 #endif
1751 #ifdef CONFIG_TCP_AO
1752 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1753 goto put_and_exit; /* OOM, release back memory */
1754 #endif
1755
1756 if (__inet_inherit_port(sk, newsk) < 0)
1757 goto put_and_exit;
1758 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1759 &found_dup_sk);
1760 if (likely(*own_req)) {
1761 tcp_move_syn(newtp, req);
1762 ireq->ireq_opt = NULL;
1763 } else {
1764 newinet->inet_opt = NULL;
1765
1766 if (!req_unhash && found_dup_sk) {
1767 /* This code path should only be executed in the
1768 * syncookie case only
1769 */
1770 bh_unlock_sock(newsk);
1771 sock_put(newsk);
1772 newsk = NULL;
1773 }
1774 }
1775 return newsk;
1776
1777 exit_overflow:
1778 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1779 exit_nonewsk:
1780 dst_release(dst);
1781 exit:
1782 tcp_listendrop(sk);
1783 return NULL;
1784 put_and_exit:
1785 newinet->inet_opt = NULL;
1786 inet_csk_prepare_forced_close(newsk);
1787 tcp_done(newsk);
1788 goto exit;
1789 }
1790
/* With CONFIG_SYN_COOKIES, hand non-SYN segments to cookie_v4_check() and
 * return its result; otherwise return @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1801
1802 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1803 struct tcphdr *th, u32 *cookie)
1804 {
1805 u16 mss = 0;
1806 #ifdef CONFIG_SYN_COOKIES
1807 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1808 &tcp_request_sock_ipv4_ops, sk, th);
1809 if (mss) {
1810 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1811 tcp_synq_overflow(sk);
1812 }
1813 #endif
1814 return mss;
1815 }
1816
1817 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1818 u32));
1819 /* The socket must have it's spinlock held when we get
1820 * here, unless it is a TCP_LISTEN socket.
1821 *
1822 * We have a potential double-lock case here, so even when
1823 * doing backlog processing we use the BH locking scheme.
1824 * This is because we cannot sleep with the original spinlock
1825 * held.
1826 */
1827 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1828 {
1829 enum skb_drop_reason reason;
1830
1831 reason = psp_sk_rx_policy_check(sk, skb);
1832 if (reason)
1833 goto err_discard;
1834
1835 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1836 struct dst_entry *dst;
1837
1838 dst = rcu_dereference_protected(sk->sk_rx_dst,
1839 lockdep_sock_is_held(sk));
1840
1841 sock_rps_save_rxhash(sk, skb);
1842 sk_mark_napi_id(sk, skb);
1843 if (dst && unlikely(dst != skb_dst(skb))) {
1844 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1845 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1846 dst, 0)) {
1847 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1848 dst_release(dst);
1849 }
1850 }
1851 tcp_rcv_established(sk, skb);
1852 return 0;
1853 }
1854
1855 if (tcp_checksum_complete(skb))
1856 goto csum_err;
1857
1858 if (sk->sk_state == TCP_LISTEN) {
1859 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1860
1861 if (!nsk)
1862 return 0;
1863 if (nsk != sk) {
1864 reason = tcp_child_process(sk, nsk, skb);
1865 sock_put(nsk);
1866 if (reason)
1867 goto reset;
1868 return 0;
1869 }
1870 } else
1871 sock_rps_save_rxhash(sk, skb);
1872
1873 reason = tcp_rcv_state_process(sk, skb);
1874 if (reason)
1875 goto reset;
1876 return 0;
1877
1878 reset:
1879 tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
1880 discard:
1881 sk_skb_reason_drop(sk, skb, reason);
1882 /* Be careful here. If this function gets more complicated and
1883 * gcc suffers from register pressure on the x86, sk (in %ebx)
1884 * might be destroyed here. This current version compiles correctly,
1885 * but you have been warned.
1886 */
1887 return 0;
1888
1889 csum_err:
1890 reason = SKB_DROP_REASON_TCP_CSUM;
1891 trace_tcp_bad_csum(skb);
1892 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1893 err_discard:
1894 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1895 goto discard;
1896 }
1897 EXPORT_SYMBOL(tcp_v4_do_rcv);
1898
1899 enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1900 {
1901 u32 tail_gso_size, tail_gso_segs;
1902 struct skb_shared_info *shinfo;
1903 const struct tcphdr *th;
1904 struct tcphdr *thtail;
1905 struct sk_buff *tail;
1906 unsigned int hdrlen;
1907 bool fragstolen;
1908 u32 gso_segs;
1909 u32 gso_size;
1910 u64 limit;
1911 int delta;
1912 int err;
1913
1914 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1915 * we can fix skb->truesize to its real value to avoid future drops.
1916 * This is valid because skb is not yet charged to the socket.
1917 * It has been noticed pure SACK packets were sometimes dropped
1918 * (if cooked by drivers without copybreak feature).
1919 */
1920 skb_condense(skb);
1921
1922 tcp_cleanup_skb(skb);
1923
1924 if (unlikely(tcp_checksum_complete(skb))) {
1925 bh_unlock_sock(sk);
1926 trace_tcp_bad_csum(skb);
1927 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1928 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1929 return SKB_DROP_REASON_TCP_CSUM;
1930 }
1931
1932 /* Attempt coalescing to last skb in backlog, even if we are
1933 * above the limits.
1934 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1935 */
1936 th = (const struct tcphdr *)skb->data;
1937 hdrlen = th->doff * 4;
1938
1939 tail = sk->sk_backlog.tail;
1940 if (!tail)
1941 goto no_coalesce;
1942 thtail = (struct tcphdr *)tail->data;
1943
1944 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1945 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1946 ((TCP_SKB_CB(tail)->tcp_flags |
1947 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1948 !((TCP_SKB_CB(tail)->tcp_flags &
1949 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1950 ((TCP_SKB_CB(tail)->tcp_flags ^
1951 TCP_SKB_CB(skb)->tcp_flags) &
1952 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
1953 !tcp_skb_can_collapse_rx(tail, skb) ||
1954 thtail->doff != th->doff ||
1955 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
1956 /* prior to PSP Rx policy check, retain exact PSP metadata */
1957 psp_skb_coalesce_diff(tail, skb))
1958 goto no_coalesce;
1959
1960 __skb_pull(skb, hdrlen);
1961
1962 shinfo = skb_shinfo(skb);
1963 gso_size = shinfo->gso_size ?: skb->len;
1964 gso_segs = shinfo->gso_segs ?: 1;
1965
1966 shinfo = skb_shinfo(tail);
1967 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1968 tail_gso_segs = shinfo->gso_segs ?: 1;
1969
1970 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1971 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1972
1973 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1974 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1975 thtail->window = th->window;
1976 }
1977
1978 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1979 * thtail->fin, so that the fast path in tcp_rcv_established()
1980 * is not entered if we append a packet with a FIN.
1981 * SYN, RST, URG are not present.
1982 * ACK is set on both packets.
1983 * PSH : we do not really care in TCP stack,
1984 * at least for 'GRO' packets.
1985 */
1986 thtail->fin |= th->fin;
1987 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1988
1989 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1990 TCP_SKB_CB(tail)->has_rxtstamp = true;
1991 tail->tstamp = skb->tstamp;
1992 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1993 }
1994
1995 /* Not as strict as GRO. We only need to carry mss max value */
1996 shinfo->gso_size = max(gso_size, tail_gso_size);
1997 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1998
1999 sk->sk_backlog.len += delta;
2000 __NET_INC_STATS(sock_net(sk),
2001 LINUX_MIB_TCPBACKLOGCOALESCE);
2002 kfree_skb_partial(skb, fragstolen);
2003 return SKB_NOT_DROPPED_YET;
2004 }
2005 __skb_push(skb, hdrlen);
2006
2007 no_coalesce:
2008 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2009 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2010 * sk_rcvbuf in normal conditions.
2011 */
2012 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2013
2014 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2015
2016 /* Only socket owner can try to collapse/prune rx queues
2017 * to reduce memory overhead, so add a little headroom here.
2018 * Few sockets backlog are possibly concurrently non empty.
2019 */
2020 limit += 64 * 1024;
2021
2022 limit = min_t(u64, limit, UINT_MAX);
2023
2024 err = sk_add_backlog(sk, skb, limit);
2025 if (unlikely(err)) {
2026 bh_unlock_sock(sk);
2027 if (err == -ENOMEM) {
2028 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2029 return SKB_DROP_REASON_PFMEMALLOC;
2030 }
2031 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2032 return SKB_DROP_REASON_SOCKET_BACKLOG;
2033 }
2034 return SKB_NOT_DROPPED_YET;
2035 }
2036
2037 static void tcp_v4_restore_cb(struct sk_buff *skb)
2038 {
2039 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2040 sizeof(struct inet_skb_parm));
2041 }
2042
2043 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2044 const struct tcphdr *th)
2045 {
2046 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2047 * barrier() makes sure compiler wont play fool^Waliasing games.
2048 */
2049 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2050 sizeof(struct inet_skb_parm));
2051 barrier();
2052
2053 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2054 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2055 skb->len - th->doff * 4);
2056 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2057 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2058 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2059 TCP_SKB_CB(skb)->sacked = 0;
2060 TCP_SKB_CB(skb)->has_rxtstamp =
2061 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2062 }
2063
2064 /*
2065 * From tcp_input.c
2066 */
2067
/*
 * Receive path entry for one IPv4 TCP segment carried in @skb.
 *
 * The segment header and checksum are validated, a socket is looked up,
 * and the segment is then handled according to the socket state:
 *  - TCP_TIME_WAIT sockets go through the do_time_wait path,
 *  - TCP_NEW_SYN_RECV request sockets are checked with tcp_check_req()
 *    against their listener,
 *  - everything else goes through the "process" path, which either calls
 *    tcp_v4_do_rcv() directly or queues the skb with tcp_add_backlog()
 *    when the socket is owned by user context.
 *
 * Returns the value of tcp_v4_do_rcv() when it is called from the
 * "process" path, and 0 on every other path (drops, backlog queueing,
 * child processing).
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	/* Only packets addressed to this host are processed. */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff is in 32-bit words; it must cover at least the base header. */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* Reload the header pointers from the skb.
	 * NOTE(review): presumably needed because the pulls above may have
	 * moved skb->data - confirm against pskb_may_pull().
	 */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	/* @refcounted is consulted on the exit paths to decide whether a
	 * sock_put() is owed for @sk.
	 */
	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* From here on @sk is the listener that owns the request. */
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_skbadd(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* The listener is no longer listening: try to move
			 * the request to another reuseport socket, otherwise
			 * drop the request and redo the lookup.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		drop_reason = tcp_filter(sk, skb);
		if (!drop_reason) {
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
					    &drop_reason);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* tcp_check_req() handed back the listener itself:
			 * restore the IP control block and continue on the
			 * regular "process" path below.
			 */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			/* A different socket was returned: let
			 * tcp_child_process() handle the skb; on failure a
			 * reset is sent on behalf of @nsk.
			 */
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				sock_put(nsk);
				goto discard_and_relse;
			}
			sock_put(nsk);
			sock_put(sk);
			return 0;
		}
	}

	/* No TIME_WAIT recycling happened on this path, so there is no
	 * initial sequence number to pass along.
	 */
	isn = 0;
process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	drop_reason = tcp_filter(sk, skb);
	if (drop_reason)
		goto discard_and_relse;

	/* Header pointers are reloaded after tcp_filter() before the
	 * control block is filled in.
	 */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);
	TCP_SKB_CB(skb)->tcp_tw_isn = isn;

	skb->dev = NULL;

	/* Listeners are handled without taking the socket bh lock. */
	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* Socket is owned by user context: queue to the backlog.
		 * NOTE(review): on failure we jump out without
		 * bh_unlock_sock(); the callee is expected to have released
		 * the lock already - confirm in tcp_add_backlog().
		 */
		drop_reason = tcp_add_backlog(sk, skb);
		if (drop_reason)
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	/* The csum_error and bad_packet labels live inside this if-body so
	 * that earlier error paths share the statistics updates below.
	 */
	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment for which no socket exists: answer with RST. */
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	/* Account the drop on the socket, release our reference if one is
	 * held, then free the skb via discard_it.
	 */
	sk_drops_skbadd(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}

	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
					       &drop_reason);
	switch (tw_status) {
	case TCP_TW_SYN: {
		/* Look for a listener that can take this SYN; if one exists
		 * the timewait socket is descheduled and the segment is
		 * processed again against the listener.
		 */
		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			/* No sock_put() will be done for sk2 on exit. */
			refcounted = false;
			goto process;
		}

		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
		if (drop_reason)
			break;
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
	case TCP_TW_ACK_OOW:
		tcp_v4_timewait_ack(sk, skb, tw_status);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2339
/* Timewait socket operations for TCP; only the object size is set here. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
};
2343
2344 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2345 {
2346 struct dst_entry *dst = skb_dst(skb);
2347
2348 if (dst && dst_hold_safe(dst)) {
2349 rcu_assign_pointer(sk->sk_rx_dst, dst);
2350 sk->sk_rx_dst_ifindex = skb->skb_iif;
2351 }
2352 }
2353
2354 const struct inet_connection_sock_af_ops ipv4_specific = {
2355 .queue_xmit = ip_queue_xmit,
2356 .rebuild_header = inet_sk_rebuild_header,
2357 .sk_rx_dst_set = inet_sk_rx_dst_set,
2358 .conn_request = tcp_v4_conn_request,
2359 .syn_recv_sock = tcp_v4_syn_recv_sock,
2360 .net_header_len = sizeof(struct iphdr),
2361 .setsockopt = ip_setsockopt,
2362 .getsockopt = ip_getsockopt,
2363 .mtu_reduced = tcp_v4_mtu_reduced,
2364 };
2365
2366 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2367 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2368 #ifdef CONFIG_TCP_MD5SIG
2369 .md5_lookup = tcp_v4_md5_lookup,
2370 .calc_md5_hash = tcp_v4_md5_hash_skb,
2371 .md5_parse = tcp_v4_parse_md5_keys,
2372 #endif
2373 #ifdef CONFIG_TCP_AO
2374 .ao_lookup = tcp_v4_ao_lookup,
2375 .calc_ao_hash = tcp_v4_ao_hash_skb,
2376 .ao_parse = tcp_v4_parse_ao,
2377 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2378 #endif
2379 };
2380
/* sk_destruct callback installed by tcp_v4_init_sock() when TCP-MD5 or
 * TCP-AO is compiled in: tears down the MD5 and AO state of @sk and then
 * runs the generic inet destructor.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
2387 #endif
2388
2389 /* NOTE: A lot of things set to zero explicitly by call to
2390 * sk_alloc() so need not be done here.
2391 */
2392 static int tcp_v4_init_sock(struct sock *sk)
2393 {
2394 struct inet_connection_sock *icsk = inet_csk(sk);
2395
2396 tcp_init_sock(sk);
2397
2398 icsk->icsk_af_ops = &ipv4_specific;
2399
2400 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2401 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2402 sk->sk_destruct = tcp4_destruct_sock;
2403 #endif
2404
2405 return 0;
2406 }
2407
/* Walk sk->sk_user_frags and hand every stored entry to napi_pp_put_page(),
 * warning once if a put fails. Compiles to nothing without
 * CONFIG_PAGE_POOL.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long idx;
	void *entry;

	xa_for_each(&sk->sk_user_frags, idx, entry) {
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)entry));
	}
#endif
}
2418
2419 void tcp_v4_destroy_sock(struct sock *sk)
2420 {
2421 struct tcp_sock *tp = tcp_sk(sk);
2422
2423 tcp_release_user_frags(sk);
2424
2425 xa_destroy(&sk->sk_user_frags);
2426
2427 trace_tcp_destroy_sock(sk);
2428
2429 tcp_clear_xmit_timers(sk);
2430
2431 tcp_cleanup_congestion_control(sk);
2432
2433 tcp_cleanup_ulp(sk);
2434
2435 /* Cleanup up the write buffer. */
2436 tcp_write_queue_purge(sk);
2437
2438 /* Check if we want to disable active TFO */
2439 tcp_fastopen_active_disable_ofo_check(sk);
2440
2441 /* Cleans up our, hopefully empty, out_of_order_queue. */
2442 skb_rbtree_purge(&tp->out_of_order_queue);
2443
2444 /* Clean up a referenced TCP bind bucket. */
2445 if (inet_csk(sk)->icsk_bind_hash)
2446 inet_put_port(sk);
2447
2448 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2449
2450 /* If socket is aborted during connect operation */
2451 tcp_free_fastopen_req(tp);
2452 tcp_fastopen_destroy_cipher(sk);
2453 tcp_saved_syn_free(tp);
2454
2455 sk_sockets_allocated_dec(sk);
2456 }
2457
2458 #ifdef CONFIG_PROC_FS
2459 /* Proc filesystem TCP sock list dumping. */
2460
/* Forward declaration: used by seq_sk_match() below to obtain the address
 * family a seq_file is restricted to.
 */
static unsigned short seq_file_family(const struct seq_file *seq);
2462
2463 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2464 {
2465 unsigned short family = seq_file_family(seq);
2466
2467 /* AF_UNSPEC is used as a match all */
2468 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2469 net_eq(sock_net(sk), seq_file_net(seq)));
2470 }
2471
2472 /* Find a non empty bucket (starting from st->bucket)
2473 * and return the first sk from it.
2474 */
2475 static void *listening_get_first(struct seq_file *seq)
2476 {
2477 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2478 struct tcp_iter_state *st = seq->private;
2479
2480 st->offset = 0;
2481 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2482 struct inet_listen_hashbucket *ilb2;
2483 struct hlist_nulls_node *node;
2484 struct sock *sk;
2485
2486 ilb2 = &hinfo->lhash2[st->bucket];
2487 if (hlist_nulls_empty(&ilb2->nulls_head))
2488 continue;
2489
2490 spin_lock(&ilb2->lock);
2491 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2492 if (seq_sk_match(seq, sk))
2493 return sk;
2494 }
2495 spin_unlock(&ilb2->lock);
2496 }
2497
2498 return NULL;
2499 }
2500
2501 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2502 * If "cur" is the last one in the st->bucket,
2503 * call listening_get_first() to return the first sk of the next
2504 * non empty bucket.
2505 */
2506 static void *listening_get_next(struct seq_file *seq, void *cur)
2507 {
2508 struct tcp_iter_state *st = seq->private;
2509 struct inet_listen_hashbucket *ilb2;
2510 struct hlist_nulls_node *node;
2511 struct inet_hashinfo *hinfo;
2512 struct sock *sk = cur;
2513
2514 ++st->num;
2515 ++st->offset;
2516
2517 sk = sk_nulls_next(sk);
2518 sk_nulls_for_each_from(sk, node) {
2519 if (seq_sk_match(seq, sk))
2520 return sk;
2521 }
2522
2523 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2524 ilb2 = &hinfo->lhash2[st->bucket];
2525 spin_unlock(&ilb2->lock);
2526 ++st->bucket;
2527 return listening_get_first(seq);
2528 }
2529
2530 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2531 {
2532 struct tcp_iter_state *st = seq->private;
2533 void *rc;
2534
2535 st->bucket = 0;
2536 st->offset = 0;
2537 rc = listening_get_first(seq);
2538
2539 while (rc && *pos) {
2540 rc = listening_get_next(seq, rc);
2541 --*pos;
2542 }
2543 return rc;
2544 }
2545
2546 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2547 const struct tcp_iter_state *st)
2548 {
2549 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2550 }
2551
2552 /*
2553 * Get first established socket starting from bucket given in st->bucket.
2554 * If st->bucket is zero, the very first socket in the hash is returned.
2555 */
2556 static void *established_get_first(struct seq_file *seq)
2557 {
2558 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2559 struct tcp_iter_state *st = seq->private;
2560
2561 st->offset = 0;
2562 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2563 struct sock *sk;
2564 struct hlist_nulls_node *node;
2565 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2566
2567 cond_resched();
2568
2569 /* Lockless fast path for the common case of empty buckets */
2570 if (empty_bucket(hinfo, st))
2571 continue;
2572
2573 spin_lock_bh(lock);
2574 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2575 if (seq_sk_match(seq, sk))
2576 return sk;
2577 }
2578 spin_unlock_bh(lock);
2579 }
2580
2581 return NULL;
2582 }
2583
2584 static void *established_get_next(struct seq_file *seq, void *cur)
2585 {
2586 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2587 struct tcp_iter_state *st = seq->private;
2588 struct hlist_nulls_node *node;
2589 struct sock *sk = cur;
2590
2591 ++st->num;
2592 ++st->offset;
2593
2594 sk = sk_nulls_next(sk);
2595
2596 sk_nulls_for_each_from(sk, node) {
2597 if (seq_sk_match(seq, sk))
2598 return sk;
2599 }
2600
2601 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2602 ++st->bucket;
2603 return established_get_first(seq);
2604 }
2605
2606 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2607 {
2608 struct tcp_iter_state *st = seq->private;
2609 void *rc;
2610
2611 st->bucket = 0;
2612 rc = established_get_first(seq);
2613
2614 while (rc && pos) {
2615 rc = established_get_next(seq, rc);
2616 --pos;
2617 }
2618 return rc;
2619 }
2620
2621 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2622 {
2623 void *rc;
2624 struct tcp_iter_state *st = seq->private;
2625
2626 st->state = TCP_SEQ_STATE_LISTENING;
2627 rc = listening_get_idx(seq, &pos);
2628
2629 if (!rc) {
2630 st->state = TCP_SEQ_STATE_ESTABLISHED;
2631 rc = established_get_idx(seq, pos);
2632 }
2633
2634 return rc;
2635 }
2636
2637 static void *tcp_seek_last_pos(struct seq_file *seq)
2638 {
2639 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2640 struct tcp_iter_state *st = seq->private;
2641 int bucket = st->bucket;
2642 int offset = st->offset;
2643 int orig_num = st->num;
2644 void *rc = NULL;
2645
2646 switch (st->state) {
2647 case TCP_SEQ_STATE_LISTENING:
2648 if (st->bucket > hinfo->lhash2_mask)
2649 break;
2650 rc = listening_get_first(seq);
2651 while (offset-- && rc && bucket == st->bucket)
2652 rc = listening_get_next(seq, rc);
2653 if (rc)
2654 break;
2655 st->bucket = 0;
2656 st->state = TCP_SEQ_STATE_ESTABLISHED;
2657 fallthrough;
2658 case TCP_SEQ_STATE_ESTABLISHED:
2659 if (st->bucket > hinfo->ehash_mask)
2660 break;
2661 rc = established_get_first(seq);
2662 while (offset-- && rc && bucket == st->bucket)
2663 rc = established_get_next(seq, rc);
2664 }
2665
2666 st->num = orig_num;
2667
2668 return rc;
2669 }
2670
2671 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2672 {
2673 struct tcp_iter_state *st = seq->private;
2674 void *rc;
2675
2676 if (*pos && *pos == st->last_pos) {
2677 rc = tcp_seek_last_pos(seq);
2678 if (rc)
2679 goto out;
2680 }
2681
2682 st->state = TCP_SEQ_STATE_LISTENING;
2683 st->num = 0;
2684 st->bucket = 0;
2685 st->offset = 0;
2686 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2687
2688 out:
2689 st->last_pos = *pos;
2690 return rc;
2691 }
2692
2693 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2694 {
2695 struct tcp_iter_state *st = seq->private;
2696 void *rc = NULL;
2697
2698 if (v == SEQ_START_TOKEN) {
2699 rc = tcp_get_idx(seq, 0);
2700 goto out;
2701 }
2702
2703 switch (st->state) {
2704 case TCP_SEQ_STATE_LISTENING:
2705 rc = listening_get_next(seq, v);
2706 if (!rc) {
2707 st->state = TCP_SEQ_STATE_ESTABLISHED;
2708 st->bucket = 0;
2709 st->offset = 0;
2710 rc = established_get_first(seq);
2711 }
2712 break;
2713 case TCP_SEQ_STATE_ESTABLISHED:
2714 rc = established_get_next(seq, v);
2715 break;
2716 }
2717 out:
2718 ++*pos;
2719 st->last_pos = *pos;
2720 return rc;
2721 }
2722
2723 void tcp_seq_stop(struct seq_file *seq, void *v)
2724 {
2725 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2726 struct tcp_iter_state *st = seq->private;
2727
2728 switch (st->state) {
2729 case TCP_SEQ_STATE_LISTENING:
2730 if (v != SEQ_START_TOKEN)
2731 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2732 break;
2733 case TCP_SEQ_STATE_ESTABLISHED:
2734 if (v)
2735 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2736 break;
2737 }
2738 }
2739
2740 static void get_openreq4(const struct request_sock *req,
2741 struct seq_file *f, int i)
2742 {
2743 const struct inet_request_sock *ireq = inet_rsk(req);
2744 long delta = req->rsk_timer.expires - jiffies;
2745
2746 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2747 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2748 i,
2749 ireq->ir_loc_addr,
2750 ireq->ir_num,
2751 ireq->ir_rmt_addr,
2752 ntohs(ireq->ir_rmt_port),
2753 TCP_SYN_RECV,
2754 0, 0, /* could print option size, but that is af dependent. */
2755 1, /* timers active (only the expire timer) */
2756 jiffies_delta_to_clock_t(delta),
2757 req->num_timeout,
2758 from_kuid_munged(seq_user_ns(f),
2759 sk_uid(req->rsk_listener)),
2760 0, /* non standard timer */
2761 0, /* open_requests have no inode */
2762 0,
2763 req);
2764 }
2765
2766 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2767 {
2768 int timer_active;
2769 unsigned long timer_expires;
2770 const struct tcp_sock *tp = tcp_sk(sk);
2771 const struct inet_connection_sock *icsk = inet_csk(sk);
2772 const struct inet_sock *inet = inet_sk(sk);
2773 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2774 __be32 dest = inet->inet_daddr;
2775 __be32 src = inet->inet_rcv_saddr;
2776 __u16 destp = ntohs(inet->inet_dport);
2777 __u16 srcp = ntohs(inet->inet_sport);
2778 u8 icsk_pending;
2779 int rx_queue;
2780 int state;
2781
2782 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2783 if (icsk_pending == ICSK_TIME_RETRANS ||
2784 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2785 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2786 timer_active = 1;
2787 timer_expires = tcp_timeout_expires(sk);
2788 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2789 timer_active = 4;
2790 timer_expires = tcp_timeout_expires(sk);
2791 } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
2792 timer_active = 2;
2793 timer_expires = icsk->icsk_keepalive_timer.expires;
2794 } else {
2795 timer_active = 0;
2796 timer_expires = jiffies;
2797 }
2798
2799 state = inet_sk_state_load(sk);
2800 if (state == TCP_LISTEN)
2801 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2802 else
2803 /* Because we don't lock the socket,
2804 * we might find a transient negative value.
2805 */
2806 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2807 READ_ONCE(tp->copied_seq), 0);
2808
2809 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2810 "%08X %5u %8d %llu %d %pK %lu %lu %u %u %d",
2811 i, src, srcp, dest, destp, state,
2812 READ_ONCE(tp->write_seq) - tp->snd_una,
2813 rx_queue,
2814 timer_active,
2815 jiffies_delta_to_clock_t(timer_expires - jiffies),
2816 READ_ONCE(icsk->icsk_retransmits),
2817 from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2818 READ_ONCE(icsk->icsk_probes_out),
2819 sock_i_ino(sk),
2820 refcount_read(&sk->sk_refcnt), sk,
2821 jiffies_to_clock_t(icsk->icsk_rto),
2822 jiffies_to_clock_t(icsk->icsk_ack.ato),
2823 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2824 tcp_snd_cwnd(tp),
2825 state == TCP_LISTEN ?
2826 fastopenq->max_qlen :
2827 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2828 }
2829
2830 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2831 struct seq_file *f, int i)
2832 {
2833 long delta = tw->tw_timer.expires - jiffies;
2834 __be32 dest, src;
2835 __u16 destp, srcp;
2836
2837 dest = tw->tw_daddr;
2838 src = tw->tw_rcv_saddr;
2839 destp = ntohs(tw->tw_dport);
2840 srcp = ntohs(tw->tw_sport);
2841
2842 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2843 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2844 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2845 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2846 refcount_read(&tw->tw_refcnt), tw);
2847 }
2848
2849 #define TMPSZ 150
2850
2851 static int tcp4_seq_show(struct seq_file *seq, void *v)
2852 {
2853 struct tcp_iter_state *st;
2854 struct sock *sk = v;
2855
2856 seq_setwidth(seq, TMPSZ - 1);
2857 if (v == SEQ_START_TOKEN) {
2858 seq_puts(seq, " sl local_address rem_address st tx_queue "
2859 "rx_queue tr tm->when retrnsmt uid timeout "
2860 "inode");
2861 goto out;
2862 }
2863 st = seq->private;
2864
2865 if (sk->sk_state == TCP_TIME_WAIT)
2866 get_timewait4_sock(v, seq, st->num);
2867 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2868 get_openreq4(v, seq, st->num);
2869 else
2870 get_tcp4_sock(v, seq, st->num);
2871 out:
2872 seq_pad(seq, '\n');
2873 return 0;
2874 }
2875
2876 #ifdef CONFIG_BPF_SYSCALL
/* One slot of the BPF TCP iterator batch.
 *
 * While a batch is live the slot holds a referenced socket pointer; when
 * the batch is released (bpf_iter_tcp_put_batch()) the reference is
 * dropped and the slot is overwritten with the socket's cookie so the
 * iteration can be resumed later.
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;	/* referenced socket still to be shown */
	__u64 cookie;		/* socket cookie kept after the put */
};
2881
/* Private state of the BPF TCP iterator: the regular /proc iterator state
 * plus a batch of sockets collected from one hash bucket.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* shared with the /proc iterator helpers */
	unsigned int cur_sk;		/* index of the batch entry being shown */
	unsigned int end_sk;		/* one past the last filled batch entry */
	unsigned int max_sk;		/* number of slots allocated in @batch */
	union bpf_tcp_iter_batch_item *batch;	/* kvmalloc'ed array of slots */
};
2889
/* Context handed to the BPF program for each iterated TCP socket
 * (filled in by tcp_prog_seq_show()).
 */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);	/* iterator metadata */
	__bpf_md_ptr(struct sock_common *, sk_common);	/* socket being shown */
	uid_t uid __aligned(8);				/* uid reported for it */
};
2895
2896 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2897 struct sock_common *sk_common, uid_t uid)
2898 {
2899 struct bpf_iter__tcp ctx;
2900
2901 meta->seq_num--; /* skip SEQ_START_TOKEN */
2902 ctx.meta = meta;
2903 ctx.sk_common = sk_common;
2904 ctx.uid = uid;
2905 return bpf_iter_run_prog(prog, &ctx);
2906 }
2907
2908 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2909 {
2910 union bpf_tcp_iter_batch_item *item;
2911 unsigned int cur_sk = iter->cur_sk;
2912 __u64 cookie;
2913
2914 /* Remember the cookies of the sockets we haven't seen yet, so we can
2915 * pick up where we left off next time around.
2916 */
2917 while (cur_sk < iter->end_sk) {
2918 item = &iter->batch[cur_sk++];
2919 cookie = sock_gen_cookie(item->sk);
2920 sock_gen_put(item->sk);
2921 item->cookie = cookie;
2922 }
2923 }
2924
2925 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2926 unsigned int new_batch_sz, gfp_t flags)
2927 {
2928 union bpf_tcp_iter_batch_item *new_batch;
2929
2930 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2931 flags | __GFP_NOWARN);
2932 if (!new_batch)
2933 return -ENOMEM;
2934
2935 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
2936 kvfree(iter->batch);
2937 iter->batch = new_batch;
2938 iter->max_sk = new_batch_sz;
2939
2940 return 0;
2941 }
2942
2943 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
2944 union bpf_tcp_iter_batch_item *cookies,
2945 int n_cookies)
2946 {
2947 struct hlist_nulls_node *node;
2948 struct sock *sk;
2949 int i;
2950
2951 for (i = 0; i < n_cookies; i++) {
2952 sk = first_sk;
2953 sk_nulls_for_each_from(sk, node)
2954 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
2955 return sk;
2956 }
2957
2958 return NULL;
2959 }
2960
2961 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
2962 {
2963 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2964 struct bpf_tcp_iter_state *iter = seq->private;
2965 struct tcp_iter_state *st = &iter->state;
2966 unsigned int find_cookie = iter->cur_sk;
2967 unsigned int end_cookie = iter->end_sk;
2968 int resume_bucket = st->bucket;
2969 struct sock *sk;
2970
2971 if (end_cookie && find_cookie == end_cookie)
2972 ++st->bucket;
2973
2974 sk = listening_get_first(seq);
2975 iter->cur_sk = 0;
2976 iter->end_sk = 0;
2977
2978 if (sk && st->bucket == resume_bucket && end_cookie) {
2979 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
2980 end_cookie - find_cookie);
2981 if (!sk) {
2982 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2983 ++st->bucket;
2984 sk = listening_get_first(seq);
2985 }
2986 }
2987
2988 return sk;
2989 }
2990
2991 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
2992 {
2993 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2994 struct bpf_tcp_iter_state *iter = seq->private;
2995 struct tcp_iter_state *st = &iter->state;
2996 unsigned int find_cookie = iter->cur_sk;
2997 unsigned int end_cookie = iter->end_sk;
2998 int resume_bucket = st->bucket;
2999 struct sock *sk;
3000
3001 if (end_cookie && find_cookie == end_cookie)
3002 ++st->bucket;
3003
3004 sk = established_get_first(seq);
3005 iter->cur_sk = 0;
3006 iter->end_sk = 0;
3007
3008 if (sk && st->bucket == resume_bucket && end_cookie) {
3009 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3010 end_cookie - find_cookie);
3011 if (!sk) {
3012 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3013 ++st->bucket;
3014 sk = established_get_first(seq);
3015 }
3016 }
3017
3018 return sk;
3019 }
3020
3021 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3022 {
3023 struct bpf_tcp_iter_state *iter = seq->private;
3024 struct tcp_iter_state *st = &iter->state;
3025 struct sock *sk = NULL;
3026
3027 switch (st->state) {
3028 case TCP_SEQ_STATE_LISTENING:
3029 sk = bpf_iter_tcp_resume_listening(seq);
3030 if (sk)
3031 break;
3032 st->bucket = 0;
3033 st->state = TCP_SEQ_STATE_ESTABLISHED;
3034 fallthrough;
3035 case TCP_SEQ_STATE_ESTABLISHED:
3036 sk = bpf_iter_tcp_resume_established(seq);
3037 break;
3038 }
3039
3040 return sk;
3041 }
3042
3043 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3044 struct sock **start_sk)
3045 {
3046 struct bpf_tcp_iter_state *iter = seq->private;
3047 struct hlist_nulls_node *node;
3048 unsigned int expected = 1;
3049 struct sock *sk;
3050
3051 sock_hold(*start_sk);
3052 iter->batch[iter->end_sk++].sk = *start_sk;
3053
3054 sk = sk_nulls_next(*start_sk);
3055 *start_sk = NULL;
3056 sk_nulls_for_each_from(sk, node) {
3057 if (seq_sk_match(seq, sk)) {
3058 if (iter->end_sk < iter->max_sk) {
3059 sock_hold(sk);
3060 iter->batch[iter->end_sk++].sk = sk;
3061 } else if (!*start_sk) {
3062 /* Remember where we left off. */
3063 *start_sk = sk;
3064 }
3065 expected++;
3066 }
3067 }
3068
3069 return expected;
3070 }
3071
3072 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3073 struct sock **start_sk)
3074 {
3075 struct bpf_tcp_iter_state *iter = seq->private;
3076 struct hlist_nulls_node *node;
3077 unsigned int expected = 1;
3078 struct sock *sk;
3079
3080 sock_hold(*start_sk);
3081 iter->batch[iter->end_sk++].sk = *start_sk;
3082
3083 sk = sk_nulls_next(*start_sk);
3084 *start_sk = NULL;
3085 sk_nulls_for_each_from(sk, node) {
3086 if (seq_sk_match(seq, sk)) {
3087 if (iter->end_sk < iter->max_sk) {
3088 sock_hold(sk);
3089 iter->batch[iter->end_sk++].sk = sk;
3090 } else if (!*start_sk) {
3091 /* Remember where we left off. */
3092 *start_sk = sk;
3093 }
3094 expected++;
3095 }
3096 }
3097
3098 return expected;
3099 }
3100
3101 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3102 struct sock **start_sk)
3103 {
3104 struct bpf_tcp_iter_state *iter = seq->private;
3105 struct tcp_iter_state *st = &iter->state;
3106
3107 if (st->state == TCP_SEQ_STATE_LISTENING)
3108 return bpf_iter_tcp_listening_batch(seq, start_sk);
3109 else
3110 return bpf_iter_tcp_established_batch(seq, start_sk);
3111 }
3112
3113 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3114 {
3115 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3116 struct bpf_tcp_iter_state *iter = seq->private;
3117 struct tcp_iter_state *st = &iter->state;
3118
3119 if (st->state == TCP_SEQ_STATE_LISTENING)
3120 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3121 else
3122 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3123 }
3124
/* Build the next batch of sockets for the BPF TCP iterator.
 *
 * The resume point is located and the matching sockets of that bucket are
 * collected. If the batch array is too small it is grown and the bucket
 * is scanned again, up to two more times: the first regrow happens with
 * the bucket unlocked (GFP_USER), the second with the bucket lock still
 * held (GFP_NOWAIT).
 *
 * Returns the first socket of the batch, NULL when there is nothing left
 * to iterate, or an ERR_PTR() when growing the batch failed. The bucket
 * lock is not held on return.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	/* end_sk == expected means every matching socket got a slot. */
	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	/* Drop the references taken so far; the slots keep the cookies so
	 * that the resume below can find the same sockets again.
	 */
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	/* Continue filling from the socket where the previous pass stopped. */
	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3172
3173 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3174 {
3175 /* bpf iter does not support lseek, so it always
3176 * continue from where it was stop()-ped.
3177 */
3178 if (*pos)
3179 return bpf_iter_tcp_batch(seq);
3180
3181 return SEQ_START_TOKEN;
3182 }
3183
3184 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3185 {
3186 struct bpf_tcp_iter_state *iter = seq->private;
3187 struct tcp_iter_state *st = &iter->state;
3188 struct sock *sk;
3189
3190 /* Whenever seq_next() is called, the iter->cur_sk is
3191 * done with seq_show(), so advance to the next sk in
3192 * the batch.
3193 */
3194 if (iter->cur_sk < iter->end_sk) {
3195 /* Keeping st->num consistent in tcp_iter_state.
3196 * bpf_iter_tcp does not use st->num.
3197 * meta.seq_num is used instead.
3198 */
3199 st->num++;
3200 sock_gen_put(iter->batch[iter->cur_sk++].sk);
3201 }
3202
3203 if (iter->cur_sk < iter->end_sk)
3204 sk = iter->batch[iter->cur_sk].sk;
3205 else
3206 sk = bpf_iter_tcp_batch(seq);
3207
3208 ++*pos;
3209 /* Keeping st->last_pos consistent in tcp_iter_state.
3210 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3211 */
3212 st->last_pos = *pos;
3213 return sk;
3214 }
3215
3216 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3217 {
3218 struct bpf_iter_meta meta;
3219 struct bpf_prog *prog;
3220 struct sock *sk = v;
3221 uid_t uid;
3222 int ret;
3223
3224 if (v == SEQ_START_TOKEN)
3225 return 0;
3226
3227 if (sk_fullsock(sk))
3228 lock_sock(sk);
3229
3230 if (unlikely(sk_unhashed(sk))) {
3231 ret = SEQ_SKIP;
3232 goto unlock;
3233 }
3234
3235 if (sk->sk_state == TCP_TIME_WAIT) {
3236 uid = 0;
3237 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3238 const struct request_sock *req = v;
3239
3240 uid = from_kuid_munged(seq_user_ns(seq),
3241 sk_uid(req->rsk_listener));
3242 } else {
3243 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3244 }
3245
3246 meta.seq = seq;
3247 prog = bpf_iter_get_info(&meta, false);
3248 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3249
3250 unlock:
3251 if (sk_fullsock(sk))
3252 release_sock(sk);
3253 return ret;
3254
3255 }
3256
3257 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3258 {
3259 struct bpf_tcp_iter_state *iter = seq->private;
3260 struct bpf_iter_meta meta;
3261 struct bpf_prog *prog;
3262
3263 if (!v) {
3264 meta.seq = seq;
3265 prog = bpf_iter_get_info(&meta, true);
3266 if (prog)
3267 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3268 }
3269
3270 if (iter->cur_sk < iter->end_sk)
3271 bpf_iter_tcp_put_batch(iter);
3272 }
3273
3274 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3275 .show = bpf_iter_tcp_seq_show,
3276 .start = bpf_iter_tcp_seq_start,
3277 .next = bpf_iter_tcp_seq_next,
3278 .stop = bpf_iter_tcp_seq_stop,
3279 };
3280 #endif
3281 static unsigned short seq_file_family(const struct seq_file *seq)
3282 {
3283 const struct tcp_seq_afinfo *afinfo;
3284
3285 #ifdef CONFIG_BPF_SYSCALL
3286 /* Iterated from bpf_iter. Let the bpf prog to filter instead. */
3287 if (seq->op == &bpf_iter_tcp_seq_ops)
3288 return AF_UNSPEC;
3289 #endif
3290
3291 /* Iterated from proc fs */
3292 afinfo = pde_data(file_inode(seq->file));
3293 return afinfo->family;
3294 }
3295
3296 static const struct seq_operations tcp4_seq_ops = {
3297 .show = tcp4_seq_show,
3298 .start = tcp_seq_start,
3299 .next = tcp_seq_next,
3300 .stop = tcp_seq_stop,
3301 };
3302
3303 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3304 .family = AF_INET,
3305 };
3306
3307 static int __net_init tcp4_proc_init_net(struct net *net)
3308 {
3309 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3310 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3311 return -ENOMEM;
3312 return 0;
3313 }
3314
3315 static void __net_exit tcp4_proc_exit_net(struct net *net)
3316 {
3317 remove_proc_entry("tcp", net->proc_net);
3318 }
3319
3320 static struct pernet_operations tcp4_net_ops = {
3321 .init = tcp4_proc_init_net,
3322 .exit = tcp4_proc_exit_net,
3323 };
3324
3325 int __init tcp4_proc_init(void)
3326 {
3327 return register_pernet_subsys(&tcp4_net_ops);
3328 }
3329
3330 void tcp4_proc_exit(void)
3331 {
3332 unregister_pernet_subsys(&tcp4_net_ops);
3333 }
3334 #endif /* CONFIG_PROC_FS */
3335
3336 struct proto tcp_prot = {
3337 .name = "TCP",
3338 .owner = THIS_MODULE,
3339 .close = tcp_close,
3340 .pre_connect = tcp_v4_pre_connect,
3341 .connect = tcp_v4_connect,
3342 .disconnect = tcp_disconnect,
3343 .accept = inet_csk_accept,
3344 .ioctl = tcp_ioctl,
3345 .init = tcp_v4_init_sock,
3346 .destroy = tcp_v4_destroy_sock,
3347 .shutdown = tcp_shutdown,
3348 .setsockopt = tcp_setsockopt,
3349 .getsockopt = tcp_getsockopt,
3350 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3351 .keepalive = tcp_set_keepalive,
3352 .recvmsg = tcp_recvmsg,
3353 .sendmsg = tcp_sendmsg,
3354 .splice_eof = tcp_splice_eof,
3355 .backlog_rcv = tcp_v4_do_rcv,
3356 .release_cb = tcp_release_cb,
3357 .hash = inet_hash,
3358 .unhash = inet_unhash,
3359 .get_port = inet_csk_get_port,
3360 .put_port = inet_put_port,
3361 #ifdef CONFIG_BPF_SYSCALL
3362 .psock_update_sk_prot = tcp_bpf_update_proto,
3363 #endif
3364 .enter_memory_pressure = tcp_enter_memory_pressure,
3365 .leave_memory_pressure = tcp_leave_memory_pressure,
3366 .stream_memory_free = tcp_stream_memory_free,
3367 .sockets_allocated = &tcp_sockets_allocated,
3368
3369 .memory_allocated = &net_aligned_data.tcp_memory_allocated,
3370 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3371
3372 .memory_pressure = &tcp_memory_pressure,
3373 .sysctl_mem = sysctl_tcp_mem,
3374 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3375 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3376 .max_header = MAX_TCP_HEADER,
3377 .obj_size = sizeof(struct tcp_sock),
3378 .freeptr_offset = offsetof(struct tcp_sock,
3379 inet_conn.icsk_inet.sk.sk_freeptr),
3380 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3381 .twsk_prot = &tcp_timewait_sock_ops,
3382 .rsk_prot = &tcp_request_sock_ops,
3383 .h.hashinfo = NULL,
3384 .no_autobind = true,
3385 .diag_destroy = tcp_abort,
3386 };
3387 EXPORT_SYMBOL(tcp_prot);
3388
3389 static void __net_exit tcp_sk_exit(struct net *net)
3390 {
3391 if (net->ipv4.tcp_congestion_control)
3392 bpf_module_put(net->ipv4.tcp_congestion_control,
3393 net->ipv4.tcp_congestion_control->owner);
3394 }
3395
3396 static void __net_init tcp_set_hashinfo(struct net *net)
3397 {
3398 struct inet_hashinfo *hinfo;
3399 unsigned int ehash_entries;
3400 struct net *old_net;
3401
3402 if (net_eq(net, &init_net))
3403 goto fallback;
3404
3405 old_net = current->nsproxy->net_ns;
3406 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3407 if (!ehash_entries)
3408 goto fallback;
3409
3410 ehash_entries = roundup_pow_of_two(ehash_entries);
3411 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3412 if (!hinfo) {
3413 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3414 "for a netns, fallback to the global one\n",
3415 ehash_entries);
3416 fallback:
3417 hinfo = &tcp_hashinfo;
3418 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3419 }
3420
3421 net->ipv4.tcp_death_row.hashinfo = hinfo;
3422 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3423 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3424 }
3425
3426 static int __net_init tcp_sk_init(struct net *net)
3427 {
3428 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3429 net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3430 net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3431 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3432
3433 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3434 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3435 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3436 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3437 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3438
3439 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3440 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3441 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3442
3443 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3444 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3445 net->ipv4.sysctl_tcp_syncookies = 1;
3446 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3447 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3448 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3449 net->ipv4.sysctl_tcp_orphan_retries = 0;
3450 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3451 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3452 net->ipv4.sysctl_tcp_tw_reuse = 2;
3453 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3454 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3455
3456 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3457 tcp_set_hashinfo(net);
3458
3459 net->ipv4.sysctl_tcp_sack = 1;
3460 net->ipv4.sysctl_tcp_window_scaling = 1;
3461 net->ipv4.sysctl_tcp_timestamps = 1;
3462 net->ipv4.sysctl_tcp_early_retrans = 3;
3463 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3464 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3465 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3466 net->ipv4.sysctl_tcp_max_reordering = 300;
3467 net->ipv4.sysctl_tcp_dsack = 1;
3468 net->ipv4.sysctl_tcp_app_win = 31;
3469 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3470 net->ipv4.sysctl_tcp_frto = 2;
3471 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3472 net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
3473 /* This limits the percentage of the congestion window which we
3474 * will allow a single TSO frame to consume. Building TSO frames
3475 * which are too large can cause TCP streams to be bursty.
3476 */
3477 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3478 /* Default TSQ limit of 4 MB */
3479 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3480
3481 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3482 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3483
3484 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3485 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3486 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3487 net->ipv4.sysctl_tcp_autocorking = 1;
3488 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3489 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3490 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3491 if (net != &init_net) {
3492 memcpy(net->ipv4.sysctl_tcp_rmem,
3493 init_net.ipv4.sysctl_tcp_rmem,
3494 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3495 memcpy(net->ipv4.sysctl_tcp_wmem,
3496 init_net.ipv4.sysctl_tcp_wmem,
3497 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3498 }
3499 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3500 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
3501 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3502 net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
3503 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3504 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3505 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3506 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3507
3508 /* Set default values for PLB */
3509 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3510 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3511 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3512 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3513 /* Default congestion threshold for PLB to mark a round is 50% */
3514 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3515
3516 /* Reno is always built in */
3517 if (!net_eq(net, &init_net) &&
3518 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3519 init_net.ipv4.tcp_congestion_control->owner))
3520 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3521 else
3522 net->ipv4.tcp_congestion_control = &tcp_reno;
3523
3524 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3525 net->ipv4.sysctl_tcp_shrink_window = 0;
3526
3527 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3528 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3529 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3530
3531 return 0;
3532 }
3533
3534 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3535 {
3536 struct net *net;
3537
3538 /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3539 * and failed setup_net error unwinding path are serialized.
3540 *
3541 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3542 * net_exit_list, the thread that dismantles a particular twsk must
3543 * do so without other thread progressing to refcount_dec_and_test() of
3544 * tcp_death_row.tw_refcount.
3545 */
3546 mutex_lock(&tcp_exit_batch_mutex);
3547
3548 tcp_twsk_purge(net_exit_list);
3549
3550 list_for_each_entry(net, net_exit_list, exit_list) {
3551 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3552 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3553 tcp_fastopen_ctx_destroy(net);
3554 }
3555
3556 mutex_unlock(&tcp_exit_batch_mutex);
3557 }
3558
3559 static struct pernet_operations __net_initdata tcp_sk_ops = {
3560 .init = tcp_sk_init,
3561 .exit = tcp_sk_exit,
3562 .exit_batch = tcp_sk_exit_batch,
3563 };
3564
3565 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the "tcp" bpf iterator function.
 * NOTE(review): DEFINE_BPF_ITER_FUNC is defined outside this chunk; the
 * argument list (meta, sk_common, uid) lines up with what
 * bpf_iter_tcp_seq_show() hands to tcp_prog_seq_show() - confirm.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Batch size requested by bpf_iter_init_tcp() when the iterator's
 * private state is first set up.
 */
#define INIT_BATCH_SZ 16
3570
3571 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3572 {
3573 struct bpf_tcp_iter_state *iter = priv_data;
3574 int err;
3575
3576 err = bpf_iter_init_seq_net(priv_data, aux);
3577 if (err)
3578 return err;
3579
3580 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3581 if (err) {
3582 bpf_iter_fini_seq_net(priv_data);
3583 return err;
3584 }
3585
3586 return 0;
3587 }
3588
3589 static void bpf_iter_fini_tcp(void *priv_data)
3590 {
3591 struct bpf_tcp_iter_state *iter = priv_data;
3592
3593 bpf_iter_fini_seq_net(priv_data);
3594 kvfree(iter->batch);
3595 }
3596
3597 static const struct bpf_iter_seq_info tcp_seq_info = {
3598 .seq_ops = &bpf_iter_tcp_seq_ops,
3599 .init_seq_private = bpf_iter_init_tcp,
3600 .fini_seq_private = bpf_iter_fini_tcp,
3601 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3602 };
3603
3604 static const struct bpf_func_proto *
3605 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3606 const struct bpf_prog *prog)
3607 {
3608 switch (func_id) {
3609 case BPF_FUNC_setsockopt:
3610 return &bpf_sk_setsockopt_proto;
3611 case BPF_FUNC_getsockopt:
3612 return &bpf_sk_getsockopt_proto;
3613 default:
3614 return NULL;
3615 }
3616 }
3617
3618 static struct bpf_iter_reg tcp_reg_info = {
3619 .target = "tcp",
3620 .ctx_arg_info_size = 1,
3621 .ctx_arg_info = {
3622 { offsetof(struct bpf_iter__tcp, sk_common),
3623 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3624 },
3625 .get_func_proto = bpf_iter_tcp_get_func_proto,
3626 .seq_info = &tcp_seq_info,
3627 };
3628
3629 static void __init bpf_iter_register(void)
3630 {
3631 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3632 if (bpf_iter_reg_target(&tcp_reg_info))
3633 pr_warn("Warning: could not register bpf iterator tcp\n");
3634 }
3635
3636 #endif
3637
3638 void __init tcp_v4_init(void)
3639 {
3640 int cpu, res;
3641
3642 for_each_possible_cpu(cpu) {
3643 struct sock *sk;
3644
3645 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3646 IPPROTO_TCP, &init_net);
3647 if (res)
3648 panic("Failed to create the TCP control socket.\n");
3649 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3650
3651 /* Please enforce IP_DF and IPID==0 for RST and
3652 * ACK sent in SYN-RECV and TIME-WAIT state.
3653 */
3654 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3655
3656 sk->sk_clockid = CLOCK_MONOTONIC;
3657
3658 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3659 }
3660 if (register_pernet_subsys(&tcp_sk_ops))
3661 panic("Failed to create the TCP control socket.\n");
3662
3663 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3664 bpf_iter_register();
3665 #endif
3666 }
3667