1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89
90 #include <crypto/md5.h>
91 #include <crypto/utils.h>
92
93 #include <trace/events/tcp.h>
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
97 __be32 daddr, __be32 saddr, const struct tcphdr *th);
98 #endif
99
/* Main TCP socket hash tables (established/bind/listen). */
struct inet_hashinfo tcp_hashinfo;

/* Per-CPU control socket used to emit RSTs/ACKs without a full socket
 * context; bh_lock serializes the nested-BH critical section around it
 * (see tcp_v4_send_reset()/tcp_v4_send_ack()).
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Serializes netns exit-batch processing for TCP.
 * NOTE(review): users of this mutex are not visible in this chunk.
 */
static DEFINE_MUTEX(tcp_exit_batch_mutex);
107
108 static union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net * net,const struct sk_buff * skb)109 tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
110 {
111 return secure_tcp_seq_and_ts_off(net,
112 ip_hdr(skb)->daddr,
113 ip_hdr(skb)->saddr,
114 tcp_hdr(skb)->dest,
115 tcp_hdr(skb)->source);
116 }
117
/* Decide whether TIME-WAIT socket @sktw may be reused for a new outgoing
 * connection on the same 4-tuple from @sk.
 *
 * Returns 1 when reuse is allowed; in that case a reference on @sktw has
 * been taken and @sk's write_seq / ts_recent state has been primed from
 * the twsk (unless the socket is under TCP repair).  Returns 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	/* Never reuse a socket still in FIN_WAIT2 substate. */
	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;
		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	/* @twp == NULL means the caller already committed to this bucket;
	 * otherwise reuse must be enabled and the configured delay elapsed.
	 */
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			/* Advance past the old send space; 0 is reserved. */
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
205
/* Pre-connect hook for AF_INET: validate @addr_len and run any attached
 * BPF cgroup connect programs before tcp_v4_connect() proper.
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
220
/* This will initiate an outgoing connection.
 *
 * Called with the socket locked.  Resolves the route (honouring a
 * source-route first hop if IP options request one), binds a source
 * address/port, enters SYN-SENT and transmits the SYN (possibly deferred
 * for TCP Fast Open).  Returns 0 or a negative errno; on failure the
 * socket is moved back to TCP_CLOSE and unhashed.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With strict source routing, route towards the first hop instead
	 * of the final destination.
	 */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast or broadcast addresses. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		/* No source yet: adopt the route's source address and move
		 * the socket to the matching bhash2 bucket.
		 */
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Re-check the route now that the (possibly autoselected) source
	 * port is known.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		union tcp_seq_and_ts_off st;

		st = secure_tcp_seq_and_ts_off(net,
					       inet->inet_saddr,
					       inet->inet_daddr,
					       inet->inet_sport,
					       usin->sin_port);
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq, st.seq);
		WRITE_ONCE(tp->tsoffset, st.ts_off);
	}

	atomic_set(&inet->inet_id, get_random_u16());

	/* Fast Open may defer the actual SYN until sendmsg(). */
	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);
366
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 *
 * Reads the MTU previously stashed in tp->mtu_info by tcp_v4_err(),
 * updates the cached path MTU and, when the new MTU shrinks our MSS,
 * retransmits immediately ("fast" PMTU discovery).
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu, dmtu;

	/* Nothing to do for listeners or closed sockets. */
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	dmtu = dst4_mtu(dst);
	if (mtu < dmtu && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
		tcp_sync_mss(sk, dmtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
406
do_redirect(struct sk_buff * skb,struct sock * sk)407 static void do_redirect(struct sk_buff *skb, struct sock *sk)
408 {
409 struct dst_entry *dst = __sk_dst_check(sk, 0);
410
411 if (dst)
412 dst->ops->redirect(dst, sk, skb);
413 }
414
415
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets
 *
 * @seq must match the request's initial sequence number, otherwise the
 * ICMP is counted as out-of-window and ignored.  When @abort is true the
 * request is dropped silently (POSIX gives no way to report the error
 * through accept()).  Consumes the caller's reference on @sk.
 */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);
440
/* TCP-LD (RFC 6069) logic
 *
 * On certain ICMP unreachables, revert one step of exponential RTO
 * backoff: the ICMP suggests the peer is alive and our retransmissions
 * were lost to a transient routing problem, not congestion.  @seq must
 * be the current snd_una and a retransmission must be in progress.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	/* Cannot safely touch timers while the socket is owned by user. */
	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	/* Undo one backoff step and recompute the RTO from srtt. */
	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
479
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	/* skb->data points at the inner (quoted) IP header of the ICMP. */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		/* Abort the request only on "hard" unreachables. */
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	/* Drop ICMPs quoting a sequence outside our current send window. */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs send out by Linux are always <576bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			/* Stash the new MTU; process now, or defer to
			 * tcp_release_cb() if the socket is owned.
			 */
			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *				--ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}
662
/* Prepare @skb for TCP checksum offload over an IPv4 pseudo-header:
 * seed th->check with the folded pseudo-header sum and point
 * csum_start/csum_offset at the checksum field so the device (or the
 * software fallback) can complete it.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th;

	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);

	th = tcp_hdr(skb);
	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
671
672 /* This routine computes an IPv4 TCP checksum. */
tcp_v4_send_check(struct sock * sk,struct sk_buff * skb)673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
674 {
675 const struct inet_sock *inet = inet_sk(sk);
676
677 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
678 }
679 EXPORT_IPV6_MOD(tcp_v4_send_check);
680
681 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
682
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
684 const struct tcp_ao_hdr *aoh,
685 struct ip_reply_arg *arg, struct tcphdr *reply,
686 __be32 reply_options[REPLY_OPTIONS_LEN])
687 {
688 #ifdef CONFIG_TCP_AO
689 int sdif = tcp_v4_sdif(skb);
690 int dif = inet_iif(skb);
691 int l3index = sdif ? dif : 0;
692 bool allocated_traffic_key;
693 struct tcp_ao_key *key;
694 char *traffic_key;
695 bool drop = true;
696 u32 ao_sne = 0;
697 u8 keyid;
698
699 rcu_read_lock();
700 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
701 &key, &traffic_key, &allocated_traffic_key,
702 &keyid, &ao_sne))
703 goto out;
704
705 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
706 (aoh->rnext_keyid << 8) | keyid);
707 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
708 reply->doff = arg->iov[0].iov_len / 4;
709
710 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
711 key, traffic_key,
712 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
713 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
714 reply, ao_sne))
715 goto out;
716 drop = false;
717 out:
718 rcu_read_unlock();
719 if (allocated_traffic_key)
720 kfree(traffic_key);
721 return drop;
722 #else
723 return true;
724 #endif
725 }
726
/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just duplicate segment or bug in other side's TCP.
 *	   So that we build reply only basing on parameters
 *	   arrived with segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* RST header plus room for MD5/AO/MPTCP reply options. */
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		/* No ACK to mirror: ACK everything the segment consumed. */
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		/* Verify the incoming segment's MD5 before replying. */
		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (crypto_memneq(md5_hash_location, newhash, 16))
			goto out;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	/* Send via the per-cpu control socket, under local BH lock. */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
927
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?

   Builds a bare ACK with the given seq/ack/window, optionally carrying
   timestamp, MD5 or AO options, and transmits it through the per-cpu
   control socket.
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* Timestamp option occupies rep.opt[0..2]. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* MD5 option goes after the timestamp option, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		/* AO option goes after the timestamp option, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* Send via the per-cpu control socket, under local BH lock. */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
1034
/* Answer a segment received for a TIME_WAIT socket with an ACK that
 * echoes the remembered timewait state (rcv_nxt, window, timestamps).
 * The ACK is signed with TCP-AO or TCP-MD5 when the dead connection had
 * a matching key.  Consumes the timewait refcount on every return path.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Cleaning only ECN bits of TW ACKs of oow data or is paws_reject,
	 * while not cleaning ECN bits of other TW ACKs to avoid these ACKs
	 * being placed in a different service queues (Classic rather than L4S)
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed or duplicated auth options: drop silently. */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* NOTE: when CONFIG_TCP_AO is disabled, the dead "if (0)" arm in the
	 * #else branch below keeps this else-if chain syntactically intact.
	 */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1098
/* Send an ACK on behalf of a request socket (regular SYN_RECV or Fast
 * Open).  Signs the ACK with TCP-AO when the handshake used AO, falling
 * back to TCP-MD5 when a matching key exists.  On auth-option parse
 * errors or vanished AO keys the ACK is silently dropped, letting the
 * handshake time out.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Per-request traffic key, freed after the ACK is sent. */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
1176
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 *
 *	Returns the net_xmit_eval()-filtered transmit status, or -1 when
 *	no route could be found or the SYN-ACK skb allocation failed.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		/* Remember the ECN bits we put in the SYN-ACK's TOS. */
		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		/* Optionally reflect the DSCP of the incoming SYN while
		 * keeping our own ECN bits.
		 */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		/* RCU protects the saved IP options for the reply. */
		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}
1226
1227 /*
1228 * IPv4 request_sock destructor.
1229 */
1230 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1231 {
1232 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1233 }
1234
1235 #ifdef CONFIG_TCP_MD5SIG
1236 /*
1237 * RFC2385 MD5 checksumming requires a mapping of
1238 * IP address->MD5 Key.
1239 * We need to maintain these in the sk structure.
1240 */
1241
1242 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1243 EXPORT_IPV6_MOD(tcp_md5_needed);
1244
1245 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1246 {
1247 if (!old)
1248 return true;
1249
1250 /* l3index always overrides non-l3index */
1251 if (old->l3index && new->l3index == 0)
1252 return false;
1253 if (old->l3index == 0 && new->l3index)
1254 return true;
1255
1256 return old->prefixlen < new->prefixlen;
1257 }
1258
/* Find the Key structure for an address.  Returns the best matching key
 * per better_md5_match(): an L3-scoped (VRF) key beats an unscoped one,
 * then the longest prefix wins.  With @any_l3index the per-key l3index
 * filter is skipped.  Returns NULL when no key list or no match exists.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Keys flagged with an ifindex only match their own L3 domain. */
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1303
/* Exact-match MD5 key lookup: unlike __tcp_md5_do_lookup(), every
 * identity field (address bytes, prefixlen, l3index, IFINDEX flag)
 * must match literally.  Used by the add/del paths to find the entry
 * to update or remove.  Returns NULL when no exact match exists.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		/* Only the IFINDEX bit of @flags participates in identity. */
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}
1337
1338 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1339 const struct sock *addr_sk)
1340 {
1341 const union tcp_md5_addr *addr;
1342 int l3index;
1343
1344 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1345 addr_sk->sk_bound_dev_if);
1346 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1347 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1348 }
1349 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1350
1351 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1352 {
1353 struct tcp_sock *tp = tcp_sk(sk);
1354 struct tcp_md5sig_info *md5sig;
1355
1356 md5sig = kmalloc_obj(*md5sig, gfp);
1357 if (!md5sig)
1358 return -ENOMEM;
1359
1360 sk_gso_disable(sk);
1361 INIT_HLIST_HEAD(&md5sig->head);
1362 rcu_assign_pointer(tp->md5sig_info, md5sig);
1363 return 0;
1364 }
1365
/* This can be called on a newly created socket, from other files.
 * Insert a new MD5 key or update an existing exact match in place.
 * Caller holds the socket lock; @newkeylen must already be validated
 * (<= TCP_MD5SIG_MAXKEYLEN).  Returns 0 or -ENOMEM.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	/* __GFP_ZERO: see the keylen/key ordering comment above. */
	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	/* Publish only after the key is fully initialized. */
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
1415
/* Add an MD5 key from process context (setsockopt path).
 * Lazily allocates the per-socket key list and takes a reference on the
 * tcp_md5_needed static branch; the allocation is rolled back if the
 * static-key reference cannot be taken.  Returns 0, -ENOMEM,
 * -EOPNOTSUPP (FIPS mode) or -EUSERS (static key refcount exhausted).
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		/* Refuse the socket's first key outright under FIPS. */
		if (fips_enabled) {
			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
			return -EOPNOTSUPP;
		}

		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
			return -ENOMEM;

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Undo the info allocation; no keys were added yet. */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);
1445
/* Copy an existing MD5 key onto a freshly created socket (accept path).
 * Atomic-context counterpart of tcp_md5_do_add(): uses GFP_ATOMIC and
 * the non-sleeping static-key increment, with the same rollback on
 * failure.  Returns 0, -ENOMEM or -EUSERS.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
			return -ENOMEM;

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			/* Undo the info allocation; no keys were added yet. */
			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);
1473
1474 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1475 u8 prefixlen, int l3index, u8 flags)
1476 {
1477 struct tcp_md5sig_key *key;
1478
1479 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1480 if (!key)
1481 return -ENOENT;
1482 hlist_del_rcu(&key->node);
1483 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1484 kfree_rcu(key, rcu);
1485 return 0;
1486 }
1487 EXPORT_IPV6_MOD(tcp_md5_do_del);
1488
/* Release every MD5 key attached to @sk.  Runs when no concurrent
 * readers can remain (note the plain hlist_del()/kfree() instead of the
 * RCU variants, and the unconditional rcu_dereference_protected()).
 */
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del(&key->node);
		/* Return the memory that was accounted to the socket. */
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree(key);
	}
}
1504
/* Handle TCP_MD5SIG / TCP_MD5SIG_EXT setsockopt: validate the user's
 * struct tcp_md5sig and add or (for a zero keylen) delete the key.
 * TCP_MD5SIG_EXT additionally honours a prefix length and an L3 master
 * device binding.  Returns 0 or a negative errno.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	/* Only the IFINDEX flag is kept; l3flag is its boolean shadow. */
	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	/* Zero key length means "delete this key". */
	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}
1570
1571 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1572 __be32 daddr, __be32 saddr,
1573 const struct tcphdr *th, int nbytes)
1574 {
1575 struct {
1576 struct tcp4_pseudohdr ip;
1577 struct tcphdr tcp;
1578 } h;
1579
1580 h.ip.saddr = saddr;
1581 h.ip.daddr = daddr;
1582 h.ip.pad = 0;
1583 h.ip.protocol = IPPROTO_TCP;
1584 h.ip.len = cpu_to_be16(nbytes);
1585 h.tcp = *th;
1586 h.tcp.check = 0;
1587 md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1588 }
1589
/* Compute the RFC 2385 MD5 signature for a bare TCP header (no payload),
 * writing the 16-byte digest to @md5_hash.  Used for locally built
 * reply segments (see the tcp_v4_send_ack() signing path above).
 */
static noinline_for_stack void
tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct md5_ctx ctx;

	md5_init(&ctx);
	/* th->doff << 2 is the header length in bytes, options included. */
	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
	tcp_md5_hash_key(&ctx, key);
	md5_final(&ctx, md5_hash);
}
1601
1602 noinline_for_stack void
1603 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1604 const struct sock *sk, const struct sk_buff *skb)
1605 {
1606 const struct tcphdr *th = tcp_hdr(skb);
1607 __be32 saddr, daddr;
1608 struct md5_ctx ctx;
1609
1610 if (sk) { /* valid for establish/request sockets */
1611 saddr = sk->sk_rcv_saddr;
1612 daddr = sk->sk_daddr;
1613 } else {
1614 const struct iphdr *iph = ip_hdr(skb);
1615 saddr = iph->saddr;
1616 daddr = iph->daddr;
1617 }
1618
1619 md5_init(&ctx);
1620 tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1621 tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1622 tcp_md5_hash_key(&ctx, key);
1623 md5_final(&ctx, md5_hash);
1624 }
1625 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1626
1627 #endif
1628
1629 static void tcp_v4_init_req(struct request_sock *req,
1630 const struct sock *sk_listener,
1631 struct sk_buff *skb)
1632 {
1633 struct inet_request_sock *ireq = inet_rsk(req);
1634 struct net *net = sock_net(sk_listener);
1635
1636 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1637 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1638 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1639 }
1640
/* Initialize the request sock from the SYN and resolve the route for
 * its SYN-ACK.  Returns NULL (caller drops the SYN) when the LSM hook
 * vetoes the connection or no route is found.  @tw_isn is part of the
 * af-independent interface and unused here.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
1654
/* Generic request_sock operations for IPv4 TCP listeners. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		= PF_INET,
	.obj_size	= sizeof(struct tcp_request_sock),
	.send_ack	= tcp_v4_reqsk_send_ack,
	.destructor	= tcp_v4_reqsk_destructor,
	.send_reset	= tcp_v4_send_reset,
};
1662
/* Address-family specific hooks consumed by the generic
 * tcp_conn_request() / SYN-ACK machinery.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	= TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	= tcp_v4_ao_lookup_rsk,
	.ao_calc_key	= tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	= tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq = cookie_v4_init_sequence,
#endif
	.route_req	= tcp_v4_route_req,
	.init_seq_and_ts_off = tcp_v4_init_seq_and_ts_off,
	.send_synack	= tcp_v4_send_synack,
};
1681
1682 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1683 {
1684 /* Never answer to SYNs send to broadcast or multicast */
1685 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1686 goto drop;
1687
1688 return tcp_conn_request(&tcp_request_sock_ops,
1689 &tcp_request_sock_ipv4_ops, sk, skb);
1690
1691 drop:
1692 tcp_listendrop(sk);
1693 return 0;
1694 }
1695 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1696
1697
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * Clones listener state into a child, installs route and MSS, copies
 * any MD5/AO keys matching the peer, inherits the bound port and hashes
 * the child into the established table.  *own_req reports whether our
 * child won the (possibly racing) ehash insertion.  @opt_child_init is
 * an optional hook run after sk_setup_caps() (only when IPv6 is built).
 * Returns the child socket or NULL on failure.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp = tcp_sk(newsk);
	newinet = inet_sk(newsk);
	ireq = inet_rsk(req);
	inet_opt = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index = inet_iif(skb);
	newinet->mc_ttl = ip_hdr(skb)->ttl;
	newinet->rcv_tos = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	/* Randomize the IP ID counter for this flow. */
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	if (key && !tcp_rsk_used_ao(req)) {
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		ireq->ireq_opt = NULL;
	} else {
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* inet_opt is owned by the request sock; don't free it twice. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1822
/* On a listener, treat a non-SYN segment as a potential syncookie ACK:
 * cookie_v4_check() returns a freshly created child on success, the
 * listener itself, or NULL for an invalid cookie.  Without
 * CONFIG_SYN_COOKIES this is a no-op that returns @sk unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
1833
/* Compute a syncookie ISN and encoded MSS for a would-be SYN-ACK
 * without creating any request state.  Stores the cookie in *cookie and
 * marks the listener's syn queue as overflowed.  Returns the MSS, or 0
 * when syncookies are not available/applicable.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
	u16 mss = 0;
#ifdef CONFIG_SYN_COOKIES
	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
				    &tcp_request_sock_ipv4_ops, sk, th);
	if (mss) {
		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
		tcp_synq_overflow(sk);
	}
#endif
	return mss;
}
1848
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have it's spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Dispatches an input segment: ESTABLISHED fast path, listener
 * (including syncookie child creation), or the generic state machine.
 * Sends a RST when processing asks for one.  Always returns 0.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			/* Drop the cached rx dst if it went stale or the
			 * packet arrived on a different interface.
			 */
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			return 0;
		if (nsk != sk) {
			/* Syncookie child: hand the segment to it. */
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1934
/* Early demux: before routing, try to find an established socket for
 * this segment and attach it (plus its cached input dst) to the skb so
 * the later lookup and route resolution can be skipped.  Always returns
 * 0 — failure just means no shortcut was taken.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	/* Bogus data offset: not a valid TCP header. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(net, iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		/* sock_edemux releases the socket ref taken by the lookup. */
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    sk->sk_rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
1972
/* Queue @skb on the owned socket's backlog, first trying to coalesce it
 * with the backlog tail (contiguous, flag-compatible segments only) to
 * save memory and wakeups.  Returns true when the skb was dropped (and
 * *reason set) with the socket unlocked, false when it was queued or
 * coalesced.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
		     enum skb_drop_reason *reason)
{
	u32 tail_gso_size, tail_gso_segs;
	struct skb_shared_info *shinfo;
	const struct tcphdr *th;
	struct tcphdr *thtail;
	struct sk_buff *tail;
	unsigned int hdrlen;
	bool fragstolen;
	u32 gso_segs;
	u32 gso_size;
	u64 limit;
	int delta;
	int err;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	tcp_cleanup_skb(skb);

	if (unlikely(tcp_checksum_complete(skb))) {
		bh_unlock_sock(sk);
		trace_tcp_bad_csum(skb);
		*reason = SKB_DROP_REASON_TCP_CSUM;
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
		return true;
	}

	/* Attempt coalescing to last skb in backlog, even if we are
	 * above the limits.
	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
	 */
	th = (const struct tcphdr *)skb->data;
	hdrlen = th->doff * 4;

	tail = sk->sk_backlog.tail;
	if (!tail)
		goto no_coalesce;
	thtail = (struct tcphdr *)tail->data;

	/* Coalesce only when: sequence numbers are contiguous, DSCP/ECN
	 * fields match, no SYN/RST/URG on either, both carry ACK, the
	 * ECN-related flags agree, headers (incl. options) are identical.
	 */
	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
	    ((TCP_SKB_CB(tail)->tcp_flags |
	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
	    !((TCP_SKB_CB(tail)->tcp_flags &
	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
	    ((TCP_SKB_CB(tail)->tcp_flags ^
	      TCP_SKB_CB(skb)->tcp_flags) &
	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
	    !tcp_skb_can_collapse_rx(tail, skb) ||
	    thtail->doff != th->doff ||
	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
	    /* prior to PSP Rx policy check, retain exact PSP metadata */
	    psp_skb_coalesce_diff(tail, skb))
		goto no_coalesce;

	__skb_pull(skb, hdrlen);

	shinfo = skb_shinfo(skb);
	gso_size = shinfo->gso_size ?: skb->len;
	gso_segs = shinfo->gso_segs ?: 1;

	shinfo = skb_shinfo(tail);
	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
	tail_gso_segs = shinfo->gso_segs ?: 1;

	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

		/* Keep the freshest ACK and window advertisement. */
		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
			thtail->window = th->window;
		}

		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
		 * thtail->fin, so that the fast path in tcp_rcv_established()
		 * is not entered if we append a packet with a FIN.
		 * SYN, RST, URG are not present.
		 * ACK is set on both packets.
		 * PSH : we do not really care in TCP stack,
		 * at least for 'GRO' packets.
		 */
		thtail->fin |= th->fin;
		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			TCP_SKB_CB(tail)->has_rxtstamp = true;
			tail->tstamp = skb->tstamp;
			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
		}

		/* Not as strict as GRO. We only need to carry mss max value */
		shinfo->gso_size = max(gso_size, tail_gso_size);
		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

		sk->sk_backlog.len += delta;
		__NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPBACKLOGCOALESCE);
		kfree_skb_partial(skb, fragstolen);
		return false;
	}
	__skb_push(skb, hdrlen);

no_coalesce:
	/* sk->sk_backlog.len is reset only at the end of __release_sock().
	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
	 * sk_rcvbuf in normal conditions.
	 */
	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets backlog are possibly concurrently non empty.
	 */
	limit += 64 * 1024;

	limit = min_t(u64, limit, UINT_MAX);

	err = sk_add_backlog(sk, skb, limit);
	if (unlikely(err)) {
		bh_unlock_sock(sk);
		if (err == -ENOMEM) {
			*reason = SKB_DROP_REASON_PFMEMALLOC;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		} else {
			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		}
		return true;
	}
	return false;
}
EXPORT_IPV6_MOD(tcp_add_backlog);
2115
/* Undo tcp_v4_fill_cb(): move the saved IPv4 control block back to the
 * front of skb->cb[] so IP-layer code sees a valid IPCB again (needed
 * before re-doing a socket lookup or handing the skb back to IP).
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
		sizeof(struct inet_skb_parm));
}
2121
/* Populate TCP_SKB_CB() from the parsed IP/TCP headers: sequence window,
 * TCP flags, DSCP/ECN bits and rx-timestamp presence.  Must be called
 * before the skb enters the TCP state machine.
 */
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
			   const struct tcphdr *th)
{
	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
	 * barrier() makes sure compiler wont play fool^Waliasing games.
	 */
	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
		sizeof(struct inet_skb_parm));
	barrier();

	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	/* SYN and FIN each consume one unit of sequence space */
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->has_rxtstamp =
			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
2142
2143 /*
2144 * From tcp_input.c
2145 */
2146
/* Main IPv4 TCP receive routine, invoked by the IP layer for every TCP
 * segment delivered to this host.  Validates header and checksum, looks
 * up the owning socket (full socket, request sock, or time-wait sock)
 * and dispatches the segment accordingly.  Always consumes the skb and
 * returns 0.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net_rcu(skb->dev);
	enum skb_drop_reason drop_reason;
	enum tcp_tw_status tw_status;
	int sdif = inet_sdif(skb);
	int dif = inet_iif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk = NULL;
	bool refcounted;	/* true if we hold a reference on sk that must be dropped */
	int ret;
	u32 isn;

	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	__TCP_INC_STATS(net, TCP_MIB_INSEGS);

	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = (const struct tcphdr *)skb->data;

	/* doff is in 32-bit words; anything below the minimal header is bogus */
	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto bad_packet;
	}
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */

	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
		goto csum_error;

	/* pskb_may_pull() may have reallocated the header; re-read pointers */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
lookup:
	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		struct request_sock *req = inet_reqsk(sk);
		bool req_stolen = false;
		struct sock *nsk;

		/* The lookup returned a request sock; work on its listener */
		sk = req->rsk_listener;
		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		else
			drop_reason = tcp_inbound_hash(sk, req, skb,
						       &iph->saddr, &iph->daddr,
						       AF_INET, dif, sdif);
		if (unlikely(drop_reason)) {
			sk_drops_skbadd(sk, skb);
			reqsk_put(req);
			goto discard_it;
		}
		if (tcp_checksum_complete(skb)) {
			reqsk_put(req);
			goto csum_error;
		}
		if (unlikely(sk->sk_state != TCP_LISTEN)) {
			/* Listener went away; try to migrate the request to
			 * another socket in the reuseport group.
			 */
			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
			if (!nsk) {
				inet_csk_reqsk_queue_drop_and_put(sk, req);
				goto lookup;
			}
			sk = nsk;
			/* reuseport_migrate_sock() has already held one sk_refcnt
			 * before returning.
			 */
		} else {
			/* We own a reference on the listener, increase it again
			 * as we might lose it too soon.
			 */
			sock_hold(sk);
		}
		refcounted = true;
		nsk = NULL;
		if (!tcp_filter(sk, skb, &drop_reason)) {
			/* tcp_filter() (BPF) may have trimmed/relocated headers */
			th = (const struct tcphdr *)skb->data;
			iph = ip_hdr(skb);
			tcp_v4_fill_cb(skb, iph, th);
			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
					    &drop_reason);
		}
		if (!nsk) {
			reqsk_put(req);
			if (req_stolen) {
				/* Another cpu got exclusive access to req
				 * and created a full blown socket.
				 * Try to feed this packet to this socket
				 * instead of discarding it.
				 */
				tcp_v4_restore_cb(skb);
				sock_put(sk);
				goto lookup;
			}
			goto discard_and_relse;
		}
		nf_reset_ct(skb);
		if (nsk == sk) {
			/* Still a plain retransmitted SYN for the listener */
			reqsk_put(req);
			tcp_v4_restore_cb(skb);
		} else {
			drop_reason = tcp_child_process(sk, nsk, skb);
			if (drop_reason) {
				enum sk_rst_reason rst_reason;

				rst_reason = sk_rst_convert_drop_reason(drop_reason);
				tcp_v4_send_reset(nsk, skb, rst_reason);
				goto discard_and_relse;
			}
			sock_put(sk);
			return 0;
		}
	}

process:
	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
			goto discard_and_relse;
		}
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		goto discard_and_relse;
	}

	/* MD5/AO signature verification for established sockets */
	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
				       AF_INET, dif, sdif);
	if (drop_reason)
		goto discard_and_relse;

	nf_reset_ct(skb);

	if (tcp_filter(sk, skb, &drop_reason))
		goto discard_and_relse;

	/* re-read headers: tcp_filter() may have changed skb geometry */
	th = (const struct tcphdr *)skb->data;
	iph = ip_hdr(skb);
	tcp_v4_fill_cb(skb, iph, th);

	skb->dev = NULL;

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

	sk_incoming_cpu_update(sk);

	bh_lock_sock_nested(sk);
	tcp_segs_in(tcp_sk(sk), skb);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		/* Socket owned by user context: queue to backlog;
		 * tcp_add_backlog() returns true if the skb was dropped.
		 */
		if (tcp_add_backlog(sk, skb, &drop_reason))
			goto discard_and_relse;
	}
	bh_unlock_sock(sk);

put_and_return:
	if (refcounted)
		sock_put(sk);

	return ret;

no_tcp_socket:
	drop_reason = SKB_DROP_REASON_NO_SOCKET;
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
csum_error:
		drop_reason = SKB_DROP_REASON_TCP_CSUM;
		trace_tcp_bad_csum(skb);
		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
		__TCP_INC_STATS(net, TCP_MIB_INERRS);
	} else {
		/* Valid segment, no owner: answer with a RST */
		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
	}

discard_it:
	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
	/* Discard frame. */
	sk_skb_reason_drop(sk, skb, drop_reason);
	return 0;

discard_and_relse:
	sk_drops_skbadd(sk, skb);
	if (refcounted)
		sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	tcp_v4_fill_cb(skb, iph, th);

	if (tcp_checksum_complete(skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto csum_error;
	}

	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
					       &drop_reason);
	switch (tw_status) {
	case TCP_TW_SYN: {
		/* Acceptable new SYN: hand it to a current listener, reusing
		 * the time-wait port pair.
		 */
		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
							iph->saddr, th->source,
							iph->daddr, th->dest,
							inet_iif(skb),
							sdif);
		if (sk2) {
			inet_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
			tcp_v4_restore_cb(skb);
			refcounted = false;
			__this_cpu_write(tcp_tw_isn, isn);
			goto process;
		}

		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
		if (drop_reason)
			break;
	}
		/* to ACK */
		fallthrough;
	case TCP_TW_ACK:
	case TCP_TW_ACK_OOW:
		tcp_v4_timewait_ack(sk, skb, tw_status);
		break;
	case TCP_TW_RST:
		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
		inet_twsk_deschedule_put(inet_twsk(sk));
		goto discard_it;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
2412
/* Time-wait sock ops for IPv4: only the object size needs specifying. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size = sizeof(struct tcp_timewait_sock),
};
2416
2417 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2418 {
2419 struct dst_entry *dst = skb_dst(skb);
2420
2421 if (dst && dst_hold_safe(dst)) {
2422 rcu_assign_pointer(sk->sk_rx_dst, dst);
2423 sk->sk_rx_dst_ifindex = skb->skb_iif;
2424 }
2425 }
2426 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2427
/* Address-family-specific connection operations for plain IPv4 TCP
 * sockets; installed as icsk->icsk_af_ops by tcp_v4_init_sock().
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit = ip_queue_xmit,
	.send_check = tcp_v4_send_check,
	.rebuild_header = inet_sk_rebuild_header,
	.sk_rx_dst_set = inet_sk_rx_dst_set,
	.conn_request = tcp_v4_conn_request,
	.syn_recv_sock = tcp_v4_syn_recv_sock,
	.net_header_len = sizeof(struct iphdr),
	.setsockopt = ip_setsockopt,
	.getsockopt = ip_getsockopt,
	.mtu_reduced = tcp_v4_mtu_reduced,
};
EXPORT_IPV6_MOD(ipv4_specific);
2441
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* Segment-signing operations (TCP-MD5 and/or TCP-AO) for IPv4 sockets. */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
	.md5_lookup = tcp_v4_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
	.md5_parse = tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup = tcp_v4_ao_lookup,
	.calc_ao_hash = tcp_v4_ao_hash_skb,
	.ao_parse = tcp_v4_parse_ao,
	.ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};

/* sk->sk_destruct for sockets that may hold MD5/AO key material:
 * release the keys before the generic inet destructor runs.
 */
static void tcp4_destruct_sock(struct sock *sk)
{
	tcp_md5_destruct_sock(sk);
	tcp_ao_destroy_sock(sk, false);
	inet_sock_destruct(sk);
}
#endif
2464
/* NOTE: A lot of things set to zero explicitly by call to
 * sk_alloc() so need not be done here.
 */
/* Protocol ->init() hook for IPv4 TCP sockets: runs the generic TCP
 * socket init, then installs the IPv4-specific af_ops (and, when
 * MD5/AO are compiled in, the signing ops and key-freeing destructor).
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_init_sock(sk);

	icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
	sk->sk_destruct = tcp4_destruct_sock;
#endif

	return 0;
}
2483
/* Walk sk->sk_user_frags and drop one page-pool reference per entry.
 * No-op unless CONFIG_PAGE_POOL is enabled.
 * NOTE(review): entries are presumably netmem frags pinned on behalf of
 * userspace receive — confirm against the code that populates the xarray.
 */
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
	unsigned long index;
	void *netmem;

	xa_for_each(&sk->sk_user_frags, index, netmem)
		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
#endif
}
2494
/* Protocol ->destroy() hook: release every TCP-level resource still
 * attached to the socket (timers, queues, congestion/ULP state, bound
 * port, fastopen state, saved SYN) before the socket memory is freed.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_release_user_frags(sk);

	xa_destroy(&sk->sk_user_frags);

	trace_tcp_destroy_sock(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	tcp_cleanup_ulp(sk);

	/* Cleanup up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Check if we want to disable active TFO */
	tcp_fastopen_active_disable_ofo_check(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	skb_rbtree_purge(&tp->out_of_order_queue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/* A live fastopen child still pointing at us here would be a bug */
	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

	/* If socket is aborted during connect operation */
	tcp_free_fastopen_req(tp);
	tcp_fastopen_destroy_cipher(sk);
	tcp_saved_syn_free(tp);

	sk_sockets_allocated_dec(sk);
}
EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2534
2535 #ifdef CONFIG_PROC_FS
2536 /* Proc filesystem TCP sock list dumping. */
2537
2538 static unsigned short seq_file_family(const struct seq_file *seq);
2539
2540 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2541 {
2542 unsigned short family = seq_file_family(seq);
2543
2544 /* AF_UNSPEC is used as a match all */
2545 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2546 net_eq(sock_net(sk), seq_file_net(seq)));
2547 }
2548
/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 *
 * On success, returns with that bucket's lhash2 spinlock HELD; the
 * caller (listening_get_next()/tcp_seq_stop()) is responsible for
 * unlocking.  Returns NULL with no lock held when the table is
 * exhausted.
 */
static void *listening_get_first(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	st->offset = 0;
	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
		struct inet_listen_hashbucket *ilb2;
		struct hlist_nulls_node *node;
		struct sock *sk;

		ilb2 = &hinfo->lhash2[st->bucket];
		/* lockless peek to skip empty buckets cheaply */
		if (hlist_nulls_empty(&ilb2->nulls_head))
			continue;

		spin_lock(&ilb2->lock);
		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
			if (seq_sk_match(seq, sk))
				return sk;	/* bucket lock stays held */
		}
		spin_unlock(&ilb2->lock);
	}

	return NULL;
}
2577
/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 *
 * Called with st->bucket's lhash2 lock held (taken by
 * listening_get_first()); drops it before moving to the next bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_iter_state *st = seq->private;
	struct inet_listen_hashbucket *ilb2;
	struct hlist_nulls_node *node;
	struct inet_hashinfo *hinfo;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);
	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;	/* same bucket, lock still held */
	}

	/* bucket exhausted: release its lock and advance */
	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	ilb2 = &hinfo->lhash2[st->bucket];
	spin_unlock(&ilb2->lock);
	++st->bucket;
	return listening_get_first(seq);
}
2606
2607 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2608 {
2609 struct tcp_iter_state *st = seq->private;
2610 void *rc;
2611
2612 st->bucket = 0;
2613 st->offset = 0;
2614 rc = listening_get_first(seq);
2615
2616 while (rc && *pos) {
2617 rc = listening_get_next(seq, rc);
2618 --*pos;
2619 }
2620 return rc;
2621 }
2622
/* True if the ehash bucket selected by st->bucket holds no sockets.
 * Read locklessly; callers that care re-check under the bucket lock.
 */
static inline bool empty_bucket(struct inet_hashinfo *hinfo,
				const struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
2628
2629 /*
2630 * Get first established socket starting from bucket given in st->bucket.
2631 * If st->bucket is zero, the very first socket in the hash is returned.
2632 */
2633 static void *established_get_first(struct seq_file *seq)
2634 {
2635 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2636 struct tcp_iter_state *st = seq->private;
2637
2638 st->offset = 0;
2639 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2640 struct sock *sk;
2641 struct hlist_nulls_node *node;
2642 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2643
2644 cond_resched();
2645
2646 /* Lockless fast path for the common case of empty buckets */
2647 if (empty_bucket(hinfo, st))
2648 continue;
2649
2650 spin_lock_bh(lock);
2651 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2652 if (seq_sk_match(seq, sk))
2653 return sk;
2654 }
2655 spin_unlock_bh(lock);
2656 }
2657
2658 return NULL;
2659 }
2660
/* Advance to the next matching established socket after "cur" in the
 * current bucket, or to the first socket of the next non-empty bucket.
 * Called with the current bucket's ehash lock held; drops it before
 * moving on.
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;

	++st->num;
	++st->offset;

	sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (seq_sk_match(seq, sk))
			return sk;	/* same bucket, lock still held */
	}

	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
	++st->bucket;
	return established_get_first(seq);
}
2682
2683 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2684 {
2685 struct tcp_iter_state *st = seq->private;
2686 void *rc;
2687
2688 st->bucket = 0;
2689 rc = established_get_first(seq);
2690
2691 while (rc && pos) {
2692 rc = established_get_next(seq, rc);
2693 --pos;
2694 }
2695 return rc;
2696 }
2697
2698 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2699 {
2700 void *rc;
2701 struct tcp_iter_state *st = seq->private;
2702
2703 st->state = TCP_SEQ_STATE_LISTENING;
2704 rc = listening_get_idx(seq, &pos);
2705
2706 if (!rc) {
2707 st->state = TCP_SEQ_STATE_ESTABLISHED;
2708 rc = established_get_idx(seq, pos);
2709 }
2710
2711 return rc;
2712 }
2713
/* Fast-resume: try to jump straight back to the bucket/offset recorded
 * by the previous read() pass instead of re-walking from the start.
 * Returns NULL (forcing a full rescan) if the saved position no longer
 * exists, e.g. because sockets were removed meanwhile.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;
	int bucket = st->bucket;
	int offset = st->offset;
	int orig_num = st->num;	/* the *_get_next() helpers bump st->num; restore it below */
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket > hinfo->lhash2_mask)
			break;
		rc = listening_get_first(seq);
		/* re-skip to the saved offset, but only within the same bucket */
		while (offset-- && rc && bucket == st->bucket)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		/* listening table exhausted: continue into established */
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (st->bucket > hinfo->ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc && bucket == st->bucket)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}
2747
/* seq_file ->start(): resume at the remembered position when possible,
 * otherwise rewind and linearly seek to *pos.  Returns SEQ_START_TOKEN
 * for *pos == 0 so ->show() can emit the header line.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	/* same position as last stop(): try the fast resume path */
	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_start);
2770
/* seq_file ->next(): advance to the following socket, switching from
 * the listening walk to the established walk when the former runs out.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		/* header was just emitted; now fetch the first real entry */
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;	/* remembered for tcp_seq_start() fast resume */
	return rc;
}
EXPORT_IPV6_MOD(tcp_seq_next);
2801
/* seq_file ->stop(): release whichever bucket lock the iterator still
 * holds (the *_get_first() helpers return with a lock held whenever v
 * is a real socket).
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
		break;
	}
}
EXPORT_IPV6_MOD(tcp_seq_stop);
2819
/* Emit one SYN_RECV request socket in /proc/net/tcp format, using the
 * same column layout as full sockets (fields that do not apply to a
 * request sock are printed as 0).
 */
static void get_openreq4(const struct request_sock *req,
			 struct seq_file *f, int i)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	long delta = req->rsk_timer.expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
		i,
		ireq->ir_loc_addr,
		ireq->ir_num,
		ireq->ir_rmt_addr,
		ntohs(ireq->ir_rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_delta_to_clock_t(delta),
		req->num_timeout,
		from_kuid_munged(seq_user_ns(f),
				 sk_uid(req->rsk_listener)),
		0,  /* non standard timer */
		0, /* open_requests have no inode */
		0,
		req);
}
2845
/* Emit one full TCP socket as a /proc/net/tcp line: addresses, state,
 * queue sizes, pending-timer info, uid, inode, refcount and assorted
 * per-connection counters.  Runs without the socket lock, so values are
 * sampled with READ_ONCE()/acquire loads and may be transiently stale.
 */
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	u8 icsk_pending;
	int rx_queue;
	int state;

	/* Map the pending icsk timer onto the proc "tr" code:
	 * 1 = retransmit/REO/loss-probe, 4 = zero-window probe, 2 = keepalive.
	 */
	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
	if (icsk_pending == ICSK_TIME_RETRANS ||
	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
		timer_active	= 1;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tcp_timeout_expires(sk);
	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
		timer_active	= 2;
		timer_expires	= icsk->icsk_keepalive_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sk->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
				      READ_ONCE(tp->copied_seq), 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
		i, src, srcp, dest, destp, state,
		READ_ONCE(tp->write_seq) - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_delta_to_clock_t(timer_expires - jiffies),
		READ_ONCE(icsk->icsk_retransmits),
		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
		READ_ONCE(icsk->icsk_probes_out),
		sock_i_ino(sk),
		refcount_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
		tcp_snd_cwnd(tp),
		state == TCP_LISTEN ?
		    fastopenq->max_qlen :
		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
2909
/* Emit one time-wait socket in /proc/net/tcp format; timer code 3 marks
 * the time-wait expiry timer, remaining columns are 0 (not applicable).
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i)
{
	long delta = tw->tw_timer.expires - jiffies;
	__be32 dest, src;
	__u16 destp, srcp;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
		refcount_read(&tw->tw_refcnt), tw);
}
2928
2929 #define TMPSZ 150
2930
2931 static int tcp4_seq_show(struct seq_file *seq, void *v)
2932 {
2933 struct tcp_iter_state *st;
2934 struct sock *sk = v;
2935
2936 seq_setwidth(seq, TMPSZ - 1);
2937 if (v == SEQ_START_TOKEN) {
2938 seq_puts(seq, " sl local_address rem_address st tx_queue "
2939 "rx_queue tr tm->when retrnsmt uid timeout "
2940 "inode");
2941 goto out;
2942 }
2943 st = seq->private;
2944
2945 if (sk->sk_state == TCP_TIME_WAIT)
2946 get_timewait4_sock(v, seq, st->num);
2947 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2948 get_openreq4(v, seq, st->num);
2949 else
2950 get_tcp4_sock(v, seq, st->num);
2951 out:
2952 seq_pad(seq, '\n');
2953 return 0;
2954 }
2955
2956 #ifdef CONFIG_BPF_SYSCALL
/* One slot of the BPF iterator batch: holds a referenced socket while
 * batched, and is overwritten with the socket's cookie when the batch
 * is released so iteration can resume after the sockets are dropped.
 */
union bpf_tcp_iter_batch_item {
	struct sock *sk;
	__u64 cookie;
};
2961
/* Private state of the BPF TCP socket iterator: the regular proc
 * iterator state plus a resizable batch of held sockets taken one hash
 * bucket at a time.
 */
struct bpf_tcp_iter_state {
	struct tcp_iter_state state;	/* bucket/offset walk state */
	unsigned int cur_sk;		/* next batch index to hand to the prog */
	unsigned int end_sk;		/* number of entries currently batched */
	unsigned int max_sk;		/* capacity of @batch */
	union bpf_tcp_iter_batch_item *batch;
};
2969
/* Context structure handed to BPF iterator programs for each socket. */
struct bpf_iter__tcp {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct sock_common *, sk_common);
	uid_t uid __aligned(8);
};
2975
2976 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2977 struct sock_common *sk_common, uid_t uid)
2978 {
2979 struct bpf_iter__tcp ctx;
2980
2981 meta->seq_num--; /* skip SEQ_START_TOKEN */
2982 ctx.meta = meta;
2983 ctx.sk_common = sk_common;
2984 ctx.uid = uid;
2985 return bpf_iter_run_prog(prog, &ctx);
2986 }
2987
2988 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2989 {
2990 union bpf_tcp_iter_batch_item *item;
2991 unsigned int cur_sk = iter->cur_sk;
2992 __u64 cookie;
2993
2994 /* Remember the cookies of the sockets we haven't seen yet, so we can
2995 * pick up where we left off next time around.
2996 */
2997 while (cur_sk < iter->end_sk) {
2998 item = &iter->batch[cur_sk++];
2999 cookie = sock_gen_cookie(item->sk);
3000 sock_gen_put(item->sk);
3001 item->cookie = cookie;
3002 }
3003 }
3004
3005 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3006 unsigned int new_batch_sz, gfp_t flags)
3007 {
3008 union bpf_tcp_iter_batch_item *new_batch;
3009
3010 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3011 flags | __GFP_NOWARN);
3012 if (!new_batch)
3013 return -ENOMEM;
3014
3015 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3016 kvfree(iter->batch);
3017 iter->batch = new_batch;
3018 iter->max_sk = new_batch_sz;
3019
3020 return 0;
3021 }
3022
/* Scan the bucket chain starting at @first_sk for the first socket
 * whose cookie matches one of the @n_cookies remembered cookies (tried
 * in their original batch order).  Returns NULL when every remembered
 * socket has disappeared from the chain.
 */
static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
					       union bpf_tcp_iter_batch_item *cookies,
					       int n_cookies)
{
	struct hlist_nulls_node *node;
	struct sock *sk;
	int i;

	for (i = 0; i < n_cookies; i++) {
		sk = first_sk;
		sk_nulls_for_each_from(sk, node)
			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
				return sk;
	}

	return NULL;
}
3040
/* Resume the listening-hash walk after the previous batch was released.
 * Uses the cookies stashed in batch[cur_sk..end_sk) to find where we
 * left off in the same bucket; if none survive, moves to the next
 * bucket.  On non-NULL return the bucket's lhash2 lock is held (via
 * listening_get_first()).  Resets cur_sk/end_sk for the new batch.
 */
static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* previous bucket fully processed: start from the next one */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = listening_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* all remembered sockets are gone; skip this bucket */
			spin_unlock(&hinfo->lhash2[st->bucket].lock);
			++st->bucket;
			sk = listening_get_first(seq);
		}
	}

	return sk;
}
3070
/* Established-hash counterpart of bpf_iter_tcp_resume_listening():
 * resume from the remembered cookies or advance to the next bucket.
 * On non-NULL return the bucket's ehash lock is held (BH disabled, via
 * established_get_first()).  Resets cur_sk/end_sk for the new batch.
 */
static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	unsigned int find_cookie = iter->cur_sk;
	unsigned int end_cookie = iter->end_sk;
	int resume_bucket = st->bucket;
	struct sock *sk;

	/* previous bucket fully processed: start from the next one */
	if (end_cookie && find_cookie == end_cookie)
		++st->bucket;

	sk = established_get_first(seq);
	iter->cur_sk = 0;
	iter->end_sk = 0;

	if (sk && st->bucket == resume_bucket && end_cookie) {
		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
						end_cookie - find_cookie);
		if (!sk) {
			/* all remembered sockets are gone; skip this bucket */
			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
			++st->bucket;
			sk = established_get_first(seq);
		}
	}

	return sk;
}
3100
/* Resume iteration in whichever table we were in; fall through from the
 * listening table to the established table when the former is done.
 * On non-NULL return the relevant bucket lock is held.
 */
static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
		sk = bpf_iter_tcp_resume_listening(seq);
		if (sk)
			break;
		st->bucket = 0;
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		fallthrough;
	case TCP_SEQ_STATE_ESTABLISHED:
		sk = bpf_iter_tcp_resume_established(seq);
		break;
	}

	return sk;
}
3122
3123 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3124 struct sock **start_sk)
3125 {
3126 struct bpf_tcp_iter_state *iter = seq->private;
3127 struct hlist_nulls_node *node;
3128 unsigned int expected = 1;
3129 struct sock *sk;
3130
3131 sock_hold(*start_sk);
3132 iter->batch[iter->end_sk++].sk = *start_sk;
3133
3134 sk = sk_nulls_next(*start_sk);
3135 *start_sk = NULL;
3136 sk_nulls_for_each_from(sk, node) {
3137 if (seq_sk_match(seq, sk)) {
3138 if (iter->end_sk < iter->max_sk) {
3139 sock_hold(sk);
3140 iter->batch[iter->end_sk++].sk = sk;
3141 } else if (!*start_sk) {
3142 /* Remember where we left off. */
3143 *start_sk = sk;
3144 }
3145 expected++;
3146 }
3147 }
3148
3149 return expected;
3150 }
3151
3152 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3153 struct sock **start_sk)
3154 {
3155 struct bpf_tcp_iter_state *iter = seq->private;
3156 struct hlist_nulls_node *node;
3157 unsigned int expected = 1;
3158 struct sock *sk;
3159
3160 sock_hold(*start_sk);
3161 iter->batch[iter->end_sk++].sk = *start_sk;
3162
3163 sk = sk_nulls_next(*start_sk);
3164 *start_sk = NULL;
3165 sk_nulls_for_each_from(sk, node) {
3166 if (seq_sk_match(seq, sk)) {
3167 if (iter->end_sk < iter->max_sk) {
3168 sock_hold(sk);
3169 iter->batch[iter->end_sk++].sk = sk;
3170 } else if (!*start_sk) {
3171 /* Remember where we left off. */
3172 *start_sk = sk;
3173 }
3174 expected++;
3175 }
3176 }
3177
3178 return expected;
3179 }
3180
3181 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3182 struct sock **start_sk)
3183 {
3184 struct bpf_tcp_iter_state *iter = seq->private;
3185 struct tcp_iter_state *st = &iter->state;
3186
3187 if (st->state == TCP_SEQ_STATE_LISTENING)
3188 return bpf_iter_tcp_listening_batch(seq, start_sk);
3189 else
3190 return bpf_iter_tcp_established_batch(seq, start_sk);
3191 }
3192
/* Drop the bucket lock held by the current walk: lhash2 spinlock for
 * the listening table, BH-disabling ehash lock for the established one.
 */
static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
{
	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;

	if (st->state == TCP_SEQ_STATE_LISTENING)
		spin_unlock(&hinfo->lhash2[st->bucket].lock);
	else
		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
}
3204
/* Refill iter->batch with the next bucket's worth of sockets.
 *
 * Returns the first socket of the new batch, NULL when iteration is done,
 * or an ERR_PTR() on allocation failure.
 *
 * bpf_iter_tcp_resume() positions us on (and, as the later unlock implies,
 * locks) the next bucket to walk.  If the batch array is too small for the
 * bucket, grow it and retry: the first retry drops the bucket lock and
 * reallocates with GFP_USER; if the bucket grew in the meantime and the
 * batch is still too small, reallocate once more while keeping the lock
 * held (GFP_NOWAIT, since we cannot sleep under the spinlock) so the bucket
 * size cannot change again.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	bpf_iter_tcp_put_batch(iter);
	/* 3/2 headroom in case the bucket keeps growing while unlocked. */
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	/* Bucket was locked throughout, so the count cannot have changed. */
	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3252
3253 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3254 {
3255 /* bpf iter does not support lseek, so it always
3256 * continue from where it was stop()-ped.
3257 */
3258 if (*pos)
3259 return bpf_iter_tcp_batch(seq);
3260
3261 return SEQ_START_TOKEN;
3262 }
3263
/* seq_file ->next for the bpf tcp iterator: release the socket that was
 * just shown and return the next one, refilling the batch from the hash
 * tables when the current batch is exhausted.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct tcp_iter_state *st = &iter->state;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk) {
		/* Keeping st->num consistent in tcp_iter_state.
		 * bpf_iter_tcp does not use st->num.
		 * meta.seq_num is used instead.
		 */
		st->num++;
		/* Drop the reference taken when the sk was batched. */
		sock_gen_put(iter->batch[iter->cur_sk++].sk);
	}

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk].sk;
	else
		sk = bpf_iter_tcp_batch(seq);

	++*pos;
	/* Keeping st->last_pos consistent in tcp_iter_state.
	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
	 */
	st->last_pos = *pos;
	return sk;
}
3295
3296 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3297 {
3298 struct bpf_iter_meta meta;
3299 struct bpf_prog *prog;
3300 struct sock *sk = v;
3301 uid_t uid;
3302 int ret;
3303
3304 if (v == SEQ_START_TOKEN)
3305 return 0;
3306
3307 if (sk_fullsock(sk))
3308 lock_sock(sk);
3309
3310 if (unlikely(sk_unhashed(sk))) {
3311 ret = SEQ_SKIP;
3312 goto unlock;
3313 }
3314
3315 if (sk->sk_state == TCP_TIME_WAIT) {
3316 uid = 0;
3317 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3318 const struct request_sock *req = v;
3319
3320 uid = from_kuid_munged(seq_user_ns(seq),
3321 sk_uid(req->rsk_listener));
3322 } else {
3323 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3324 }
3325
3326 meta.seq = seq;
3327 prog = bpf_iter_get_info(&meta, false);
3328 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3329
3330 unlock:
3331 if (sk_fullsock(sk))
3332 release_sock(sk);
3333 return ret;
3334
3335 }
3336
/* seq_file ->stop for the bpf tcp iterator.  When iteration ended
 * naturally (v == NULL), give the bpf program one final invocation with
 * a NULL socket so it can emit a summary.  Any sockets still referenced
 * in the batch are released.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)tcp_prog_seq_show(prog, &meta, v, 0);
	}

	/* stop() may run mid-batch (e.g. read() buffer full); drop the
	 * references on the sockets that were never shown.
	 */
	if (iter->cur_sk < iter->end_sk)
		bpf_iter_tcp_put_batch(iter);
}
3353
/* seq_file operations used when the tcp table is iterated via bpf_iter. */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
3360 #endif
/* Address family this seq_file should report: AF_UNSPEC for bpf_iter
 * (no kernel-side filtering), otherwise the family recorded in the proc
 * entry's afinfo (AF_INET for /proc/net/tcp).
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct tcp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* Iterated from bpf_iter. Let the bpf prog filter instead. */
	if (seq->op == &bpf_iter_tcp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Iterated from proc fs */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}
3375
/* seq_file operations backing /proc/net/tcp. */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
3382
/* afinfo attached to the /proc/net/tcp entry; seq_file_family() reads it. */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3386
3387 static int __net_init tcp4_proc_init_net(struct net *net)
3388 {
3389 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3390 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3391 return -ENOMEM;
3392 return 0;
3393 }
3394
/* Per-netns exit: remove this namespace's /proc/net/tcp entry. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3399
/* pernet hooks creating/removing /proc/net/tcp per network namespace. */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3404
/* Register the /proc/net/tcp pernet hooks at boot. */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3409
/* Unregister the /proc/net/tcp pernet hooks. */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3414 #endif /* CONFIG_PROC_FS */
3415
/* IPv4 TCP protocol descriptor: the ops the AF_INET socket layer calls
 * for SOCK_STREAM/IPPROTO_TCP sockets.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	/* Socket lifecycle and user-visible operations. */
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	/* Hashing / port allocation. */
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	/* Memory accounting and pressure handling. */
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,

	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.freeptr_offset		= offsetof(struct tcp_sock,
					   inet_conn.icsk_inet.sk.sk_freeptr),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* NULL: each netns resolves its hashinfo via tcp_death_row. */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3468
/* Per-netns exit: drop the reference on the namespace's default
 * congestion control module taken in tcp_sk_init().
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	if (net->ipv4.tcp_congestion_control)
		bpf_module_put(net->ipv4.tcp_congestion_control,
			       net->ipv4.tcp_congestion_control->owner);
}
3475
/* Choose the established-hash table for a new netns: either a private one
 * (when the parent netns requested it via sysctl_tcp_child_ehash_entries)
 * or the global tcp_hashinfo.  Also derives max_tw_buckets and
 * max_syn_backlog from the resulting table size.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
	struct inet_hashinfo *hinfo;
	unsigned int ehash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	/* The netns being created inherits the setting from the netns of
	 * the process creating it (current's netns).
	 */
	old_net = current->nsproxy->net_ns;
	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
	if (!ehash_entries)
		goto fallback;

	ehash_entries = roundup_pow_of_two(ehash_entries);
	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
	if (!hinfo) {
		pr_warn("Failed to allocate TCP ehash (entries: %u) "
			"for a netns, fallback to the global one\n",
			ehash_entries);
fallback:
		/* Note: the fallback label sits inside the if-body so the
		 * gotos above share the error path's assignments.
		 */
		hinfo = &tcp_hashinfo;
		ehash_entries = tcp_hashinfo.ehash_mask + 1;
	}

	net->ipv4.tcp_death_row.hashinfo = hinfo;
	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}
3505
/* Per-netns init: set the default value of every TCP sysctl, pick the
 * hash table and default congestion control for the namespace.
 * Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
	/* ECN / AccECN defaults. */
	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
	net->ipv4.sysctl_tcp_ecn_fallback = 1;

	/* Path MTU discovery / probing. */
	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	/* Keepalive. */
	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	/* Retries, SYN handling, TIME-WAIT reuse. */
	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	net->ipv4.sysctl_tcp_syncookies = 1;
	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
	net->ipv4.sysctl_tcp_orphan_retries = 0;
	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
	net->ipv4.sysctl_tcp_tw_reuse = 2;
	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
	tcp_set_hashinfo(net);

	/* Loss recovery and window behavior. */
	net->ipv4.sysctl_tcp_sack = 1;
	net->ipv4.sysctl_tcp_window_scaling = 1;
	net->ipv4.sysctl_tcp_timestamps = 1;
	net->ipv4.sysctl_tcp_early_retrans = 3;
	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	net->ipv4.sysctl_tcp_retrans_collapse = 1;
	net->ipv4.sysctl_tcp_max_reordering = 300;
	net->ipv4.sysctl_tcp_dsack = 1;
	net->ipv4.sysctl_tcp_app_win = 31;
	net->ipv4.sysctl_tcp_adv_win_scale = 1;
	net->ipv4.sysctl_tcp_frto = 2;
	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
	/* This limits the percentage of the congestion window which we
	 * will allow a single TSO frame to consume.  Building TSO frames
	 * which are too large can cause TCP streams to be bursty.
	 */
	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 4 MB */
	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;

	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

	net->ipv4.sysctl_tcp_min_tso_segs = 2;
	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
	net->ipv4.sysctl_tcp_autocorking = 1;
	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
	/* Child netns inherit the current rmem/wmem limits of init_net. */
	if (net != &init_net) {
		memcpy(net->ipv4.sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(net->ipv4.sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	/* Compressed-SACK heuristics. */
	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
	/* TCP Fast Open: client side enabled by default. */
	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
	atomic_set(&net->ipv4.tfo_active_disable_times, 0);

	/* Set default values for PLB */
	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
	/* Default congestion threshold for PLB to mark a round is 50% */
	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

	/* Reno is always built in */
	if (!net_eq(net, &init_net) &&
	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
			       init_net.ipv4.tcp_congestion_control->owner))
		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		net->ipv4.tcp_congestion_control = &tcp_reno;

	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
	net->ipv4.sysctl_tcp_shrink_window = 0;

	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;

	return 0;
}
3613
/* Batched per-netns teardown: purge TIME-WAIT sockets and release each
 * dying netns' ehash table and tw refcount.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
	 * and failed setup_net error unwinding path are serialized.
	 *
	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
	 * net_exit_list, the thread that dismantles a particular twsk must
	 * do so without other thread progressing to refcount_dec_and_test() of
	 * tcp_death_row.tw_refcount.
	 */
	mutex_lock(&tcp_exit_batch_mutex);

	tcp_twsk_purge(net_exit_list);

	list_for_each_entry(net, net_exit_list, exit_list) {
		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
		/* The refcount was set to 1 in tcp_sk_init(); after the twsk
		 * purge this final put must bring it to zero.
		 */
		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
		tcp_fastopen_ctx_destroy(net);
	}

	mutex_unlock(&tcp_exit_batch_mutex);
}
3638
/* pernet hooks wiring TCP's per-namespace setup and teardown. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init	   = tcp_sk_init,
	.exit	   = tcp_sk_exit,
	.exit_batch = tcp_sk_exit_batch,
};
3644
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declares the bpf_iter__tcp context layout passed to attached programs. */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial socket batch capacity; grown on demand in bpf_iter_tcp_batch(). */
#define INIT_BATCH_SZ 16
3650
3651 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3652 {
3653 struct bpf_tcp_iter_state *iter = priv_data;
3654 int err;
3655
3656 err = bpf_iter_init_seq_net(priv_data, aux);
3657 if (err)
3658 return err;
3659
3660 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3661 if (err) {
3662 bpf_iter_fini_seq_net(priv_data);
3663 return err;
3664 }
3665
3666 return 0;
3667 }
3668
/* Tear down what bpf_iter_init_tcp() set up: the seq-net state and the
 * socket batch array.
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3676
/* seq_file glue for the "tcp" bpf_iter target. */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3683
3684 static const struct bpf_func_proto *
3685 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3686 const struct bpf_prog *prog)
3687 {
3688 switch (func_id) {
3689 case BPF_FUNC_setsockopt:
3690 return &bpf_sk_setsockopt_proto;
3691 case BPF_FUNC_getsockopt:
3692 return &bpf_sk_getsockopt_proto;
3693 default:
3694 return NULL;
3695 }
3696 }
3697
/* Registration record for the "tcp" bpf_iter target.  The single context
 * argument is the sock_common pointer handed to the program (NULL on the
 * final stop() invocation).
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3708
/* Register the "tcp" bpf_iter target at boot.  The BTF id must be filled
 * in at runtime; failure is non-fatal and only logged.
 */
static void __init bpf_iter_register(void)
{
	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
	if (bpf_iter_reg_target(&tcp_reg_info))
		pr_warn("Warning: could not register bpf iterator tcp\n");
}
3715
3716 #endif
3717
/* Boot-time IPv4 TCP init: create one kernel control socket per possible
 * CPU (used for sending RSTs/ACKs on behalf of sockets the stack owns)
 * and register the TCP pernet operations.  Failure at this stage is fatal.
 */
void __init tcp_v4_init(void)
{
	int cpu, res;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, &init_net);
		if (res)
			panic("Failed to create the TCP control socket.\n");
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Please enforce IP_DF and IPID==0 for RST and
		 * ACK sent in SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		sk->sk_clockid = CLOCK_MONOTONIC;

		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
	}
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}
3747