1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61 #include <linux/sock_diag.h>
62
63 #include <net/aligned_data.h>
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/inet_ecn.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/secure_seq.h>
75 #include <net/busy_poll.h>
76 #include <net/rstreason.h>
77
78 #include <linux/inet.h>
79 #include <linux/ipv6.h>
80 #include <linux/stddef.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/inetdevice.h>
84 #include <linux/btf_ids.h>
85 #include <linux/skbuff_ref.h>
86
87 #include <crypto/hash.h>
88 #include <linux/scatterlist.h>
89
90 #include <trace/events/tcp.h>
91
92 #ifdef CONFIG_TCP_MD5SIG
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, const struct tcphdr *th);
95 #endif
96
97 struct inet_hashinfo tcp_hashinfo;
98
99 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
100 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
101 };
102
103 static DEFINE_MUTEX(tcp_exit_batch_mutex);
104
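/* Initial sequence number for an incoming connection request, derived
 * from the address/port pairs of the SYN (see secure_tcp_seq()).
 */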
105 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
106 {
107 return secure_tcp_seq(ip_hdr(skb)->daddr,
108 ip_hdr(skb)->saddr,
109 tcp_hdr(skb)->dest,
110 tcp_hdr(skb)->source);
111 }
112
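/* Per-connection timestamp offset, derived from the address pair
 * (see secure_tcp_ts_off()).
 */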
113 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
114 {
115 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
116 }
117
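/* Decide whether a TIME-WAIT socket occupying the same 4-tuple may be
 * reused for a new outgoing connection (tcp_tw_reuse).  Returns 1 and
 * takes a reference on the timewait socket when reuse is safe.
 */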
118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 struct tcp_sock *tp = tcp_sk(sk);
124 int ts_recent_stamp;
125 u32 reuse_thresh;
126
127 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 reuse = 0;
129
130 if (reuse == 2) {
131 /* Still does not detect *everything* that goes through
132 * lo, since we require a loopback src or dst address
133 * or direct binding to 'lo' interface.
134 */
135 bool loopback = false;
136 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 if (tw->tw_family == AF_INET6) {
140 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 loopback = true;
145 } else
146 #endif
147 {
148 if (ipv4_is_loopback(tw->tw_daddr) ||
149 ipv4_is_loopback(tw->tw_rcv_saddr))
150 loopback = true;
151 }
152 if (!loopback)
153 reuse = 0;
154 }
155
156 /* With PAWS, it is safe from the viewpoint
157 of data integrity. Even without PAWS it is safe provided sequence
158 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159
160 Actually, the idea is close to VJ's, only the timestamp cache is
161 held not per host but per port pair, and the TW bucket is used as the
162 state holder.
163 
164 If the TW bucket has already been destroyed we fall back to VJ's scheme
165 and use the initial timestamp retrieved from the peer table.
166 */
167 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 if (ts_recent_stamp &&
171 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 * and releasing the bucket lock.
174 */
175 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 return 0;
177
178 /* In case of repair and re-using TIME-WAIT sockets we still
179 * want to be sure that it is safe as above but honor the
180 * sequence numbers and time stamps set as part of the repair
181 * process.
182 *
183 * Without this check re-using a TIME-WAIT socket with TCP
184 * repair would accumulate a -1 on the repair assigned
185 * sequence number. The first time it is reused the sequence
186 * is -1, the second time -2, etc. This fixes that issue
187 * without appearing to create any others.
188 */
189 if (likely(!tp->repair)) {
190 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191
192 if (!seq)
193 seq = 1;
194 WRITE_ONCE(tp->write_seq, seq);
195 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
196 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 }
198
199 return 1;
200 }
201
202 return 0;
203 }
204 EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
205
206 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
207 int addr_len)
208 {
209 /* This check is replicated from tcp_v4_connect() and intended to
210 * prevent the BPF program called below from accessing bytes that are out
211 * of the bounds specified by the user in addr_len.
212 */
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 sock_owned_by_me(sk);
217
218 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
219 }
220
221 /* This will initiate an outgoing connection. */
222 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
223 {
224 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
225 struct inet_timewait_death_row *tcp_death_row;
226 struct inet_sock *inet = inet_sk(sk);
227 struct tcp_sock *tp = tcp_sk(sk);
228 struct ip_options_rcu *inet_opt;
229 struct net *net = sock_net(sk);
230 __be16 orig_sport, orig_dport;
231 __be32 daddr, nexthop;
232 struct flowi4 *fl4;
233 struct rtable *rt;
234 int err;
235
236 if (addr_len < sizeof(struct sockaddr_in))
237 return -EINVAL;
238
239 if (usin->sin_family != AF_INET)
240 return -EAFNOSUPPORT;
241
242 nexthop = daddr = usin->sin_addr.s_addr;
243 inet_opt = rcu_dereference_protected(inet->inet_opt,
244 lockdep_sock_is_held(sk));
245 if (inet_opt && inet_opt->opt.srr) {
246 if (!daddr)
247 return -EINVAL;
248 nexthop = inet_opt->opt.faddr;
249 }
250
251 orig_sport = inet->inet_sport;
252 orig_dport = usin->sin_port;
253 fl4 = &inet->cork.fl.u.ip4;
254 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
255 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
256 orig_dport, sk);
257 if (IS_ERR(rt)) {
258 err = PTR_ERR(rt);
259 if (err == -ENETUNREACH)
260 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
261 return err;
262 }
263
264 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
265 ip_rt_put(rt);
266 return -ENETUNREACH;
267 }
268
269 if (!inet_opt || !inet_opt->opt.srr)
270 daddr = fl4->daddr;
271
272 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
273
274 if (!inet->inet_saddr) {
275 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
276 if (err) {
277 ip_rt_put(rt);
278 return err;
279 }
280 } else {
281 sk_rcv_saddr_set(sk, inet->inet_saddr);
282 }
283
284 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
285 /* Reset inherited state */
286 tp->rx_opt.ts_recent = 0;
287 tp->rx_opt.ts_recent_stamp = 0;
288 if (likely(!tp->repair))
289 WRITE_ONCE(tp->write_seq, 0);
290 }
291
292 inet->inet_dport = usin->sin_port;
293 sk_daddr_set(sk, daddr);
294
295 inet_csk(sk)->icsk_ext_hdr_len = 0;
296 if (inet_opt)
297 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
298
299 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
300
301 /* Socket identity is still unknown (sport may be zero).
302 * However we set the state to SYN-SENT and, without releasing the socket
303 * lock, select a source port, enter ourselves into the hash tables and
304 * complete initialization after this.
305 */
306 tcp_set_state(sk, TCP_SYN_SENT);
307 err = inet_hash_connect(tcp_death_row, sk);
308 if (err)
309 goto failure;
310
311 sk_set_txhash(sk);
312
313 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
314 inet->inet_sport, inet->inet_dport, sk);
315 if (IS_ERR(rt)) {
316 err = PTR_ERR(rt);
317 rt = NULL;
318 goto failure;
319 }
320 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
321 /* OK, now commit destination to socket. */
322 sk->sk_gso_type = SKB_GSO_TCPV4;
323 sk_setup_caps(sk, &rt->dst);
324 rt = NULL;
325
326 if (likely(!tp->repair)) {
327 if (!tp->write_seq)
328 WRITE_ONCE(tp->write_seq,
329 secure_tcp_seq(inet->inet_saddr,
330 inet->inet_daddr,
331 inet->inet_sport,
332 usin->sin_port));
333 WRITE_ONCE(tp->tsoffset,
334 secure_tcp_ts_off(net, inet->inet_saddr,
335 inet->inet_daddr));
336 }
337
338 atomic_set(&inet->inet_id, get_random_u16());
339
340 if (tcp_fastopen_defer_connect(sk, &err))
341 return err;
342 if (err)
343 goto failure;
344
345 err = tcp_connect(sk);
346
347 if (err)
348 goto failure;
349
350 return 0;
351
352 failure:
353 /*
354 * This unhashes the socket and releases the local port,
355 * if necessary.
356 */
357 tcp_set_state(sk, TCP_CLOSE);
358 inet_bhash2_reset_saddr(sk);
359 ip_rt_put(rt);
360 sk->sk_route_caps = 0;
361 inet->inet_dport = 0;
362 return err;
363 }
364 EXPORT_IPV6_MOD(tcp_v4_connect);
365
366 /*
367 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
368 * It can be called through tcp_release_cb() if socket was owned by user
369 * at the time tcp_v4_err() was called to handle ICMP message.
370 */
371 void tcp_v4_mtu_reduced(struct sock *sk)
372 {
373 struct inet_sock *inet = inet_sk(sk);
374 struct dst_entry *dst;
375 u32 mtu;
376
377 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
378 return;
379 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
380 dst = inet_csk_update_pmtu(sk, mtu);
381 if (!dst)
382 return;
383
384 /* Something is about to go wrong... Remember the soft error
385 * in case this connection is not able to recover.
386 */
387 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
388 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
389
390 mtu = dst_mtu(dst);
391
392 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
393 ip_sk_accept_pmtu(sk) &&
394 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
395 tcp_sync_mss(sk, mtu);
396
397 /* Resend the TCP packet because it's
398 * clear that the old packet has been
399 * dropped. This is the new "fast" path mtu
400 * discovery.
401 */
402 tcp_simple_retransmit(sk);
403 } /* else let the usual retransmit timer handle it */
404 }
405 EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
406
407 static void do_redirect(struct sk_buff *skb, struct sock *sk)
408 {
409 struct dst_entry *dst = __sk_dst_check(sk, 0);
410
411 if (dst)
412 dst->ops->redirect(dst, sk, skb);
413 }
414
415
416 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
417 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
418 {
419 struct request_sock *req = inet_reqsk(sk);
420 struct net *net = sock_net(sk);
421
422 /* ICMPs are not backlogged, hence we cannot get
423 * an established socket here.
424 */
425 if (seq != tcp_rsk(req)->snt_isn) {
426 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
427 } else if (abort) {
428 /*
429 * Still in SYN_RECV, just remove it silently.
430 * There is no good way to pass the error to the newly
431 * created socket, and POSIX does not want network
432 * errors returned from accept().
433 */
434 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
435 tcp_listendrop(req->rsk_listener);
436 }
437 reqsk_put(req);
438 }
439 EXPORT_IPV6_MOD(tcp_req_err);
440
441 /* TCP-LD (RFC 6069) logic */
442 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
443 {
444 struct inet_connection_sock *icsk = inet_csk(sk);
445 struct tcp_sock *tp = tcp_sk(sk);
446 struct sk_buff *skb;
447 s32 remaining;
448 u32 delta_us;
449
450 if (sock_owned_by_user(sk))
451 return;
452
453 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
454 !icsk->icsk_backoff)
455 return;
456
457 skb = tcp_rtx_queue_head(sk);
458 if (WARN_ON_ONCE(!skb))
459 return;
460
461 icsk->icsk_backoff--;
462 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
463 icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
464
465 tcp_mstamp_refresh(tp);
466 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
467 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
468
469 if (remaining > 0) {
470 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
471 } else {
472 /* RTO revert clocked out retransmission.
473 * Will retransmit now.
474 */
475 tcp_retransmit_timer(sk);
476 }
477 }
478 EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
479
480 /*
481 * This routine is called by the ICMP module when it gets some
482 * sort of error condition. If err < 0 then the socket should
483 * be closed and the error returned to the user. If err > 0
484 * it's just the icmp type << 8 | icmp code. After adjustment
485 * header points to the first 8 bytes of the tcp header. We need
486 * to find the appropriate port.
487 *
488 * The locking strategy used here is very "optimistic". When
489 * someone else accesses the socket the ICMP is just dropped
490 * and for some paths there is no check at all.
491 * A more general error queue to queue errors for later handling
492 * is probably better.
493 *
494 */
495
496 int tcp_v4_err(struct sk_buff *skb, u32 info)
497 {
498 const struct iphdr *iph = (const struct iphdr *)skb->data;
499 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
500 struct net *net = dev_net_rcu(skb->dev);
501 const int type = icmp_hdr(skb)->type;
502 const int code = icmp_hdr(skb)->code;
503 struct request_sock *fastopen;
504 struct tcp_sock *tp;
505 u32 seq, snd_una;
506 struct sock *sk;
507 int err;
508
509 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
510 iph->daddr, th->dest, iph->saddr,
511 ntohs(th->source), inet_iif(skb), 0);
512 if (!sk) {
513 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
514 return -ENOENT;
515 }
516 if (sk->sk_state == TCP_TIME_WAIT) {
517 /* To increase the counter of ignored icmps for TCP-AO */
518 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
519 inet_twsk_put(inet_twsk(sk));
520 return 0;
521 }
522 seq = ntohl(th->seq);
523 if (sk->sk_state == TCP_NEW_SYN_RECV) {
524 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
525 type == ICMP_TIME_EXCEEDED ||
526 (type == ICMP_DEST_UNREACH &&
527 (code == ICMP_NET_UNREACH ||
528 code == ICMP_HOST_UNREACH)));
529 return 0;
530 }
531
532 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
533 sock_put(sk);
534 return 0;
535 }
536
537 bh_lock_sock(sk);
538 /* If too many ICMPs get dropped on busy
539 * servers this needs to be solved differently.
540 * We do take care of PMTU discovery (RFC1191) special case :
541 * we can receive locally generated ICMP messages while socket is held.
542 */
543 if (sock_owned_by_user(sk)) {
544 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
545 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
546 }
547 if (sk->sk_state == TCP_CLOSE)
548 goto out;
549
550 if (static_branch_unlikely(&ip4_min_ttl)) {
551 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
552 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
553 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
554 goto out;
555 }
556 }
557
558 tp = tcp_sk(sk);
559 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
560 fastopen = rcu_dereference(tp->fastopen_rsk);
561 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
562 if (sk->sk_state != TCP_LISTEN &&
563 !between(seq, snd_una, tp->snd_nxt)) {
564 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
565 goto out;
566 }
567
568 switch (type) {
569 case ICMP_REDIRECT:
570 if (!sock_owned_by_user(sk))
571 do_redirect(skb, sk);
572 goto out;
573 case ICMP_SOURCE_QUENCH:
574 /* Just silently ignore these. */
575 goto out;
576 case ICMP_PARAMETERPROB:
577 err = EPROTO;
578 break;
579 case ICMP_DEST_UNREACH:
580 if (code > NR_ICMP_UNREACH)
581 goto out;
582
583 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
584 /* We are not interested in TCP_LISTEN and open_requests
585 * (SYN-ACKs sent out by Linux are always <576 bytes so
586 * they should go through unfragmented).
587 */
588 if (sk->sk_state == TCP_LISTEN)
589 goto out;
590
591 WRITE_ONCE(tp->mtu_info, info);
592 if (!sock_owned_by_user(sk)) {
593 tcp_v4_mtu_reduced(sk);
594 } else {
595 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
596 sock_hold(sk);
597 }
598 goto out;
599 }
600
601 err = icmp_err_convert[code].errno;
602 /* check if this ICMP message allows revert of backoff.
603 * (see RFC 6069)
604 */
605 if (!fastopen &&
606 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
607 tcp_ld_RTO_revert(sk, seq);
608 break;
609 case ICMP_TIME_EXCEEDED:
610 err = EHOSTUNREACH;
611 break;
612 default:
613 goto out;
614 }
615
616 switch (sk->sk_state) {
617 case TCP_SYN_SENT:
618 case TCP_SYN_RECV:
619 /* Only in fast or simultaneous open. If a fast open socket is
620 * already accepted it is treated as a connected one below.
621 */
622 if (fastopen && !fastopen->sk)
623 break;
624
625 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
626
627 if (!sock_owned_by_user(sk))
628 tcp_done_with_error(sk, err);
629 else
630 WRITE_ONCE(sk->sk_err_soft, err);
631 goto out;
632 }
633
634 /* If we've already connected we will keep trying
635 * until we time out, or the user gives up.
636 *
637 * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
638 * to be considered hard errors (well, FRAG_FAILED too,
639 * but it is obsoleted by pmtu discovery).
640 *
641 * Note that in the modern internet, where routing is unreliable
642 * and broken firewalls sit in every dark corner sending random
643 * errors ordered by their masters, even these two messages finally lose
644 * their original sense (even Linux sends invalid PORT_UNREACHs).
645 *
646 * Now we are in compliance with RFCs.
647 * --ANK (980905)
648 */
649
650 if (!sock_owned_by_user(sk) &&
651 inet_test_bit(RECVERR, sk)) {
652 WRITE_ONCE(sk->sk_err, err);
653 sk_error_report(sk);
654 } else { /* Only an error on timeout */
655 WRITE_ONCE(sk->sk_err_soft, err);
656 }
657
658 out:
659 bh_unlock_sock(sk);
660 sock_put(sk);
661 return 0;
662 }
663
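/* Seed th->check with the pseudo-header sum and record where the final
 * checksum lives, so the device (or the software fallback) can complete it.
 */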
664 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
665 {
666 struct tcphdr *th = tcp_hdr(skb);
667
668 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
669 skb->csum_start = skb_transport_header(skb) - skb->head;
670 skb->csum_offset = offsetof(struct tcphdr, check);
671 }
672
673 /* This routine computes an IPv4 TCP checksum. */
674 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
675 {
676 const struct inet_sock *inet = inet_sk(sk);
677
678 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
679 }
680 EXPORT_IPV6_MOD(tcp_v4_send_check);
681
682 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
683
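/* Build the TCP-AO option and signature for an outgoing RST when the
 * incoming segment carried an AO header.  Returns true if the RST must be
 * dropped instead of sent.
 */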
684 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
685 const struct tcp_ao_hdr *aoh,
686 struct ip_reply_arg *arg, struct tcphdr *reply,
687 __be32 reply_options[REPLY_OPTIONS_LEN])
688 {
689 #ifdef CONFIG_TCP_AO
690 int sdif = tcp_v4_sdif(skb);
691 int dif = inet_iif(skb);
692 int l3index = sdif ? dif : 0;
693 bool allocated_traffic_key;
694 struct tcp_ao_key *key;
695 char *traffic_key;
696 bool drop = true;
697 u32 ao_sne = 0;
698 u8 keyid;
699
700 rcu_read_lock();
701 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
702 &key, &traffic_key, &allocated_traffic_key,
703 &keyid, &ao_sne))
704 goto out;
705
706 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
707 (aoh->rnext_keyid << 8) | keyid);
708 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
709 reply->doff = arg->iov[0].iov_len / 4;
710
711 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
712 key, traffic_key,
713 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
714 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
715 reply, ao_sne))
716 goto out;
717 drop = false;
718 out:
719 rcu_read_unlock();
720 if (allocated_traffic_key)
721 kfree(traffic_key);
722 return drop;
723 #else
724 return true;
725 #endif
726 }
727
728 /*
729 * This routine will send an RST to the other tcp.
730 *
731 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
732 * for the reset?
733 * Answer: if a packet caused an RST, it is not for a socket
734 * existing in our system; if it is matched to a socket,
735 * it is just a duplicate segment or a bug in the other side's TCP.
736 * So we build the reply based only on the parameters
737 * that arrived with the segment.
738 * Exception: precedence violation. We do not implement it in any case.
739 */
740
741 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
742 enum sk_rst_reason reason)
743 {
744 const struct tcphdr *th = tcp_hdr(skb);
745 struct {
746 struct tcphdr th;
747 __be32 opt[REPLY_OPTIONS_LEN];
748 } rep;
749 const __u8 *md5_hash_location = NULL;
750 const struct tcp_ao_hdr *aoh;
751 struct ip_reply_arg arg;
752 #ifdef CONFIG_TCP_MD5SIG
753 struct tcp_md5sig_key *key = NULL;
754 unsigned char newhash[16];
755 struct sock *sk1 = NULL;
756 int genhash;
757 #endif
758 u64 transmit_time = 0;
759 struct sock *ctl_sk;
760 struct net *net;
761 u32 txhash = 0;
762
763 /* Never send a reset in response to a reset. */
764 if (th->rst)
765 return;
766
767 /* If sk is not NULL, it means we did a successful lookup and the incoming
768 * route had to be correct. prequeue might have dropped our dst.
769 */
770 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
771 return;
772
773 /* Swap the send and the receive. */
774 memset(&rep, 0, sizeof(rep));
775 rep.th.dest = th->source;
776 rep.th.source = th->dest;
777 rep.th.doff = sizeof(struct tcphdr) / 4;
778 rep.th.rst = 1;
779
780 if (th->ack) {
781 rep.th.seq = th->ack_seq;
782 } else {
783 rep.th.ack = 1;
784 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
785 skb->len - (th->doff << 2));
786 }
787
788 memset(&arg, 0, sizeof(arg));
789 arg.iov[0].iov_base = (unsigned char *)&rep;
790 arg.iov[0].iov_len = sizeof(rep.th);
791
792 net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
793
794 /* Invalid TCP option size or twice included auth */
795 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
796 return;
797
798 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
799 return;
800
801 #ifdef CONFIG_TCP_MD5SIG
802 rcu_read_lock();
803 if (sk && sk_fullsock(sk)) {
804 const union tcp_md5_addr *addr;
805 int l3index;
806
807 /* sdif set, means packet ingressed via a device
808 * in an L3 domain and inet_iif is set to it.
809 */
810 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
811 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
812 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
813 } else if (md5_hash_location) {
814 const union tcp_md5_addr *addr;
815 int sdif = tcp_v4_sdif(skb);
816 int dif = inet_iif(skb);
817 int l3index;
818
819 /*
820 * active side is lost. Try to find the listening socket through the
821 * source port, and then find the md5 key through the listening socket.
822 * We do not lose security here:
823 * the incoming packet is checked against the md5 hash of the found key;
824 * no RST is generated if the md5 hash doesn't match.
825 */
826 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
827 NULL, 0, ip_hdr(skb)->saddr,
828 th->source, ip_hdr(skb)->daddr,
829 ntohs(th->source), dif, sdif);
830 /* don't send rst if it can't find key */
831 if (!sk1)
832 goto out;
833
834 /* sdif set, means packet ingressed via a device
835 * in an L3 domain and dif is set to it.
836 */
837 l3index = sdif ? dif : 0;
838 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
839 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
840 if (!key)
841 goto out;
842
843
844 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
845 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
846 goto out;
847
848 }
849
850 if (key) {
851 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
852 (TCPOPT_NOP << 16) |
853 (TCPOPT_MD5SIG << 8) |
854 TCPOLEN_MD5SIG);
855 /* Update length and the length the header thinks exists */
856 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
857 rep.th.doff = arg.iov[0].iov_len / 4;
858
859 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
860 key, ip_hdr(skb)->saddr,
861 ip_hdr(skb)->daddr, &rep.th);
862 }
863 #endif
864 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
865 if (rep.opt[0] == 0) {
866 __be32 mrst = mptcp_reset_option(skb);
867
868 if (mrst) {
869 rep.opt[0] = mrst;
870 arg.iov[0].iov_len += sizeof(mrst);
871 rep.th.doff = arg.iov[0].iov_len / 4;
872 }
873 }
874
875 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
876 ip_hdr(skb)->saddr, /* XXX */
877 arg.iov[0].iov_len, IPPROTO_TCP, 0);
878 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
879 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
880
881 /* When the socket is gone, all binding information is lost and
882 * routing might fail in this case. No choice here: if we choose to force the
883 * input interface, we will misroute in case of an asymmetric route.
884 */
885 if (sk)
886 arg.bound_dev_if = sk->sk_bound_dev_if;
887
888 trace_tcp_send_reset(sk, skb, reason);
889
890 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
891 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
892
893 /* ECN bits of TW reset are cleared */
894 arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
895 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
896 local_bh_disable();
897 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
898 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
899
900 sock_net_set(ctl_sk, net);
901 if (sk) {
902 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
903 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
904 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
905 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
906 transmit_time = tcp_transmit_time(sk);
907 xfrm_sk_clone_policy(ctl_sk, sk);
908 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
909 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
910 } else {
911 ctl_sk->sk_mark = 0;
912 ctl_sk->sk_priority = 0;
913 }
914 ip_send_unicast_reply(ctl_sk, sk,
915 skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 &arg, arg.iov[0].iov_len,
918 transmit_time, txhash);
919
920 xfrm_sk_free_policy(ctl_sk);
921 sock_net_set(ctl_sk, &init_net);
922 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
923 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
924 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
925 local_bh_enable();
926
927 #ifdef CONFIG_TCP_MD5SIG
928 out:
929 rcu_read_unlock();
930 #endif
931 }
932
933 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
934    outside socket context, is certainly ugly. What can I do?
935 */
936
937 static void tcp_v4_send_ack(const struct sock *sk,
938 struct sk_buff *skb, u32 seq, u32 ack,
939 u32 win, u32 tsval, u32 tsecr, int oif,
940 struct tcp_key *key,
941 int reply_flags, u8 tos, u32 txhash)
942 {
943 const struct tcphdr *th = tcp_hdr(skb);
944 struct {
945 struct tcphdr th;
946 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
947 } rep;
948 struct net *net = sock_net(sk);
949 struct ip_reply_arg arg;
950 struct sock *ctl_sk;
951 u64 transmit_time;
952
953 memset(&rep.th, 0, sizeof(struct tcphdr));
954 memset(&arg, 0, sizeof(arg));
955
956 arg.iov[0].iov_base = (unsigned char *)&rep;
957 arg.iov[0].iov_len = sizeof(rep.th);
958 if (tsecr) {
959 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
960 (TCPOPT_TIMESTAMP << 8) |
961 TCPOLEN_TIMESTAMP);
962 rep.opt[1] = htonl(tsval);
963 rep.opt[2] = htonl(tsecr);
964 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
965 }
966
967 /* Swap the send and the receive. */
968 rep.th.dest = th->source;
969 rep.th.source = th->dest;
970 rep.th.doff = arg.iov[0].iov_len / 4;
971 rep.th.seq = htonl(seq);
972 rep.th.ack_seq = htonl(ack);
973 rep.th.ack = 1;
974 rep.th.window = htons(win);
975
976 #ifdef CONFIG_TCP_MD5SIG
977 if (tcp_key_is_md5(key)) {
978 int offset = (tsecr) ? 3 : 0;
979
980 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
981 (TCPOPT_NOP << 16) |
982 (TCPOPT_MD5SIG << 8) |
983 TCPOLEN_MD5SIG);
984 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
985 rep.th.doff = arg.iov[0].iov_len/4;
986
987 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
988 key->md5_key, ip_hdr(skb)->saddr,
989 ip_hdr(skb)->daddr, &rep.th);
990 }
991 #endif
992 #ifdef CONFIG_TCP_AO
993 if (tcp_key_is_ao(key)) {
994 int offset = (tsecr) ? 3 : 0;
995
996 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
997 (tcp_ao_len(key->ao_key) << 16) |
998 (key->ao_key->sndid << 8) |
999 key->rcv_next);
1000 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
1001 rep.th.doff = arg.iov[0].iov_len / 4;
1002
1003 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1004 key->ao_key, key->traffic_key,
1005 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1006 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1007 &rep.th, key->sne);
1008 }
1009 #endif
1010 arg.flags = reply_flags;
1011 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1012 ip_hdr(skb)->saddr, /* XXX */
1013 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1014 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1015 if (oif)
1016 arg.bound_dev_if = oif;
1017 arg.tos = tos;
1018 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1019 local_bh_disable();
1020 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1021 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1022 sock_net_set(ctl_sk, net);
1023 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1024 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1025 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1026 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1027 transmit_time = tcp_transmit_time(sk);
1028 ip_send_unicast_reply(ctl_sk, sk,
1029 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1030 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1031 &arg, arg.iov[0].iov_len,
1032 transmit_time, txhash);
1033
1034 sock_net_set(ctl_sk, &init_net);
1035 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1036 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1037 local_bh_enable();
1038 }
1039
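/* Answer a segment received for a TIME-WAIT socket with the appropriate
 * ACK, signed with any TCP-AO or MD5 key still attached to the timewait
 * socket.
 */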
1040 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
1041 enum tcp_tw_status tw_status)
1042 {
1043 struct inet_timewait_sock *tw = inet_twsk(sk);
1044 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1045 struct tcp_key key = {};
1046 u8 tos = tw->tw_tos;
1047
1048 /* Clean only the ECN bits of TW ACKs for oow data or paws_reject,
1049 * while not cleaning ECN bits of other TW ACKs, to avoid those ACKs
1050 * being placed in a different service queue (Classic rather than L4S)
1051 */
1052 if (tw_status == TCP_TW_ACK_OOW)
1053 tos &= ~INET_ECN_MASK;
1054
1055 #ifdef CONFIG_TCP_AO
1056 struct tcp_ao_info *ao_info;
1057
1058 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1059 /* FIXME: the segment to-be-acked is not verified yet */
1060 ao_info = rcu_dereference(tcptw->ao_info);
1061 if (ao_info) {
1062 const struct tcp_ao_hdr *aoh;
1063
1064 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1065 inet_twsk_put(tw);
1066 return;
1067 }
1068
1069 if (aoh)
1070 key.ao_key = tcp_ao_established_key(sk, ao_info,
1071 aoh->rnext_keyid, -1);
1072 }
1073 }
1074 if (key.ao_key) {
1075 struct tcp_ao_key *rnext_key;
1076
1077 key.traffic_key = snd_other_key(key.ao_key);
1078 key.sne = READ_ONCE(ao_info->snd_sne);
1079 rnext_key = READ_ONCE(ao_info->rnext_key);
1080 key.rcv_next = rnext_key->rcvid;
1081 key.type = TCP_KEY_AO;
1082 #else
1083 if (0) {
1084 #endif
1085 } else if (static_branch_tcp_md5()) {
1086 key.md5_key = tcp_twsk_md5_key(tcptw);
1087 if (key.md5_key)
1088 key.type = TCP_KEY_MD5;
1089 }
1090
1091 tcp_v4_send_ack(sk, skb,
1092 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1093 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1094 tcp_tw_tsval(tcptw),
1095 READ_ONCE(tcptw->tw_ts_recent),
1096 tw->tw_bound_dev_if, &key,
1097 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1098 tos,
1099 tw->tw_txhash);
1100
1101 inet_twsk_put(tw);
1102 }
1103
1104 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1105 struct request_sock *req)
1106 {
1107 struct tcp_key key = {};
1108
1109 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1110 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1111 */
1112 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1113 tcp_sk(sk)->snd_nxt;
1114
1115 #ifdef CONFIG_TCP_AO
1116 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1117 tcp_rsk_used_ao(req)) {
1118 const union tcp_md5_addr *addr;
1119 const struct tcp_ao_hdr *aoh;
1120 int l3index;
1121
1122 /* Invalid TCP option size or twice included auth */
1123 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1124 return;
1125 if (!aoh)
1126 return;
1127
1128 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1129 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1130 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1131 aoh->rnext_keyid, -1);
1132 if (unlikely(!key.ao_key)) {
1133 /* Send ACK with any matching MKT for the peer */
1134 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1135 /* Matching key disappeared (user removed the key?),
1136 * let the handshake time out.
1137 */
1138 if (!key.ao_key) {
1139 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1140 addr,
1141 ntohs(tcp_hdr(skb)->source),
1142 &ip_hdr(skb)->daddr,
1143 ntohs(tcp_hdr(skb)->dest));
1144 return;
1145 }
1146 }
1147 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1148 if (!key.traffic_key)
1149 return;
1150
1151 key.type = TCP_KEY_AO;
1152 key.rcv_next = aoh->keyid;
1153 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1154 #else
1155 if (0) {
1156 #endif
1157 } else if (static_branch_tcp_md5()) {
1158 const union tcp_md5_addr *addr;
1159 int l3index;
1160
1161 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1162 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1163 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1164 if (key.md5_key)
1165 key.type = TCP_KEY_MD5;
1166 }
1167
1168 /* Clean the ECN bits of ACKs for oow data or paws_reject */
1169 tcp_v4_send_ack(sk, skb, seq,
1170 tcp_rsk(req)->rcv_nxt,
1171 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1172 tcp_rsk_tsval(tcp_rsk(req)),
1173 req->ts_recent,
1174 0, &key,
1175 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1176 ip_hdr(skb)->tos & ~INET_ECN_MASK,
1177 READ_ONCE(tcp_rsk(req)->txhash));
1178 if (tcp_key_is_ao(&key))
1179 kfree(key.traffic_key);
1180 }
1181
1182 /*
1183 * Send a SYN-ACK after having received a SYN.
1184 * This still operates on a request_sock only, not on a big
1185 * socket.
1186 */
1187 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1188 struct flowi *fl,
1189 struct request_sock *req,
1190 struct tcp_fastopen_cookie *foc,
1191 enum tcp_synack_type synack_type,
1192 struct sk_buff *syn_skb)
1193 {
1194 const struct inet_request_sock *ireq = inet_rsk(req);
1195 struct flowi4 fl4;
1196 int err = -1;
1197 struct sk_buff *skb;
1198 u8 tos;
1199
1200 /* First, grab a route. */
1201 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1202 return -1;
1203
1204 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1205
1206 if (skb) {
1207 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1208
1209 tos = READ_ONCE(inet_sk(sk)->tos);
1210
1211 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1212 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1213 (tos & INET_ECN_MASK);
1214
1215 if (!INET_ECN_is_capable(tos) &&
1216 tcp_bpf_ca_needs_ecn((struct sock *)req))
1217 tos |= INET_ECN_ECT_0;
1218
1219 rcu_read_lock();
1220 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1221 ireq->ir_rmt_addr,
1222 rcu_dereference(ireq->ireq_opt),
1223 tos);
1224 rcu_read_unlock();
1225 err = net_xmit_eval(err);
1226 }
1227
1228 return err;
1229 }
1230
1231 /*
1232 * IPv4 request_sock destructor.
1233 */
1234 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1235 {
1236 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1237 }
1238
1239 #ifdef CONFIG_TCP_MD5SIG
1240 /*
1241 * RFC2385 MD5 checksumming requires a mapping of
1242 * IP address->MD5 Key.
1243 * We need to maintain these in the sk structure.
1244 */
1245
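/* Userspace installs these keys with setsockopt(TCP_MD5SIG) or
 * TCP_MD5SIG_EXT.  A minimal sketch of the caller side (illustrative only;
 * the address and key below are made up, error handling omitted):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */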
1246 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1247 EXPORT_IPV6_MOD(tcp_md5_needed);
1248
1249 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1250 {
1251 if (!old)
1252 return true;
1253
1254 /* l3index always overrides non-l3index */
1255 if (old->l3index && new->l3index == 0)
1256 return false;
1257 if (old->l3index == 0 && new->l3index)
1258 return true;
1259
1260 return old->prefixlen < new->prefixlen;
1261 }
1262
1263 /* Find the Key structure for an address. */
1264 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1265 const union tcp_md5_addr *addr,
1266 int family, bool any_l3index)
1267 {
1268 const struct tcp_sock *tp = tcp_sk(sk);
1269 struct tcp_md5sig_key *key;
1270 const struct tcp_md5sig_info *md5sig;
1271 __be32 mask;
1272 struct tcp_md5sig_key *best_match = NULL;
1273 bool match;
1274
1275 /* caller either holds rcu_read_lock() or socket lock */
1276 md5sig = rcu_dereference_check(tp->md5sig_info,
1277 lockdep_sock_is_held(sk));
1278 if (!md5sig)
1279 return NULL;
1280
1281 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1282 lockdep_sock_is_held(sk)) {
1283 if (key->family != family)
1284 continue;
1285 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1286 key->l3index != l3index)
1287 continue;
1288 if (family == AF_INET) {
1289 mask = inet_make_mask(key->prefixlen);
1290 match = (key->addr.a4.s_addr & mask) ==
1291 (addr->a4.s_addr & mask);
1292 #if IS_ENABLED(CONFIG_IPV6)
1293 } else if (family == AF_INET6) {
1294 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1295 key->prefixlen);
1296 #endif
1297 } else {
1298 match = false;
1299 }
1300
1301 if (match && better_md5_match(best_match, key))
1302 best_match = key;
1303 }
1304 return best_match;
1305 }
1306 EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
1307
1308 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1309 const union tcp_md5_addr *addr,
1310 int family, u8 prefixlen,
1311 int l3index, u8 flags)
1312 {
1313 const struct tcp_sock *tp = tcp_sk(sk);
1314 struct tcp_md5sig_key *key;
1315 unsigned int size = sizeof(struct in_addr);
1316 const struct tcp_md5sig_info *md5sig;
1317
1318 /* caller either holds rcu_read_lock() or socket lock */
1319 md5sig = rcu_dereference_check(tp->md5sig_info,
1320 lockdep_sock_is_held(sk));
1321 if (!md5sig)
1322 return NULL;
1323 #if IS_ENABLED(CONFIG_IPV6)
1324 if (family == AF_INET6)
1325 size = sizeof(struct in6_addr);
1326 #endif
1327 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1328 lockdep_sock_is_held(sk)) {
1329 if (key->family != family)
1330 continue;
1331 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1332 continue;
1333 if (key->l3index != l3index)
1334 continue;
1335 if (!memcmp(&key->addr, addr, size) &&
1336 key->prefixlen == prefixlen)
1337 return key;
1338 }
1339 return NULL;
1340 }
1341
1342 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1343 const struct sock *addr_sk)
1344 {
1345 const union tcp_md5_addr *addr;
1346 int l3index;
1347
1348 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1349 addr_sk->sk_bound_dev_if);
1350 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1351 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1352 }
1353 EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
1354
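/* Lazily allocate the per-socket MD5 key list the first time a key is
 * added.  GSO is also disabled here, as offloaded segmentation could not
 * regenerate the per-segment MD5 signatures.
 */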
1355 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1356 {
1357 struct tcp_sock *tp = tcp_sk(sk);
1358 struct tcp_md5sig_info *md5sig;
1359
1360 md5sig = kmalloc(sizeof(*md5sig), gfp);
1361 if (!md5sig)
1362 return -ENOMEM;
1363
1364 sk_gso_disable(sk);
1365 INIT_HLIST_HEAD(&md5sig->head);
1366 rcu_assign_pointer(tp->md5sig_info, md5sig);
1367 return 0;
1368 }
1369
1370 /* This can be called on a newly created socket, from other files */
1371 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1372 int family, u8 prefixlen, int l3index, u8 flags,
1373 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1374 {
1375 /* Add Key to the list */
1376 struct tcp_md5sig_key *key;
1377 struct tcp_sock *tp = tcp_sk(sk);
1378 struct tcp_md5sig_info *md5sig;
1379
1380 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1381 if (key) {
1382 /* Pre-existing entry - just update that one.
1383 * Note that the key might be used concurrently.
1384 * data_race() tells KCSAN that we do not care about
1385 * key mismatches, since changing the MD5 key on live flows
1386 * can lead to packet drops.
1387 */
1388 data_race(memcpy(key->key, newkey, newkeylen));
1389
1390 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1391 * Also note that a reader could catch a new key->keylen value
1392 * but an old key->key[]; this is the reason we use __GFP_ZERO
1393 * at sock_kmalloc() time below these lines.
1394 */
1395 WRITE_ONCE(key->keylen, newkeylen);
1396
1397 return 0;
1398 }
1399
1400 md5sig = rcu_dereference_protected(tp->md5sig_info,
1401 lockdep_sock_is_held(sk));
1402
1403 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1404 if (!key)
1405 return -ENOMEM;
1406
1407 memcpy(key->key, newkey, newkeylen);
1408 key->keylen = newkeylen;
1409 key->family = family;
1410 key->prefixlen = prefixlen;
1411 key->l3index = l3index;
1412 key->flags = flags;
1413 memcpy(&key->addr, addr,
1414 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1415 sizeof(struct in_addr));
1416 hlist_add_head_rcu(&key->node, &md5sig->head);
1417 return 0;
1418 }
1419
1420 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1421 int family, u8 prefixlen, int l3index, u8 flags,
1422 const u8 *newkey, u8 newkeylen)
1423 {
1424 struct tcp_sock *tp = tcp_sk(sk);
1425
1426 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1427 if (tcp_md5_alloc_sigpool())
1428 return -ENOMEM;
1429
1430 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1431 tcp_md5_release_sigpool();
1432 return -ENOMEM;
1433 }
1434
1435 if (!static_branch_inc(&tcp_md5_needed.key)) {
1436 struct tcp_md5sig_info *md5sig;
1437
1438 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1439 rcu_assign_pointer(tp->md5sig_info, NULL);
1440 kfree_rcu(md5sig, rcu);
1441 tcp_md5_release_sigpool();
1442 return -EUSERS;
1443 }
1444 }
1445
1446 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1447 newkey, newkeylen, GFP_KERNEL);
1448 }
1449 EXPORT_IPV6_MOD(tcp_md5_do_add);
1450
1451 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1452 int family, u8 prefixlen, int l3index,
1453 struct tcp_md5sig_key *key)
1454 {
1455 struct tcp_sock *tp = tcp_sk(sk);
1456
1457 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1458 tcp_md5_add_sigpool();
1459
1460 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1461 tcp_md5_release_sigpool();
1462 return -ENOMEM;
1463 }
1464
1465 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1466 struct tcp_md5sig_info *md5sig;
1467
1468 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1469 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1470 rcu_assign_pointer(tp->md5sig_info, NULL);
1471 kfree_rcu(md5sig, rcu);
1472 tcp_md5_release_sigpool();
1473 return -EUSERS;
1474 }
1475 }
1476
1477 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1478 key->flags, key->key, key->keylen,
1479 sk_gfp_mask(sk, GFP_ATOMIC));
1480 }
1481 EXPORT_IPV6_MOD(tcp_md5_key_copy);
1482
1483 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1484 u8 prefixlen, int l3index, u8 flags)
1485 {
1486 struct tcp_md5sig_key *key;
1487
1488 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1489 if (!key)
1490 return -ENOENT;
1491 hlist_del_rcu(&key->node);
1492 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1493 kfree_rcu(key, rcu);
1494 return 0;
1495 }
1496 EXPORT_IPV6_MOD(tcp_md5_do_del);
1497
1498 void tcp_clear_md5_list(struct sock *sk)
1499 {
1500 struct tcp_sock *tp = tcp_sk(sk);
1501 struct tcp_md5sig_key *key;
1502 struct hlist_node *n;
1503 struct tcp_md5sig_info *md5sig;
1504
1505 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1506
1507 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1508 hlist_del_rcu(&key->node);
1509 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1510 kfree_rcu(key, rcu);
1511 }
1512 }
1513
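/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler: copy the request from
 * userspace, validate the prefix/ifindex extensions, then add or delete
 * the key.
 */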
1514 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1515 sockptr_t optval, int optlen)
1516 {
1517 struct tcp_md5sig cmd;
1518 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1519 const union tcp_md5_addr *addr;
1520 u8 prefixlen = 32;
1521 int l3index = 0;
1522 bool l3flag;
1523 u8 flags;
1524
1525 if (optlen < sizeof(cmd))
1526 return -EINVAL;
1527
1528 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1529 return -EFAULT;
1530
1531 if (sin->sin_family != AF_INET)
1532 return -EINVAL;
1533
1534 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1535 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1536
1537 if (optname == TCP_MD5SIG_EXT &&
1538 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1539 prefixlen = cmd.tcpm_prefixlen;
1540 if (prefixlen > 32)
1541 return -EINVAL;
1542 }
1543
1544 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1545 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1546 struct net_device *dev;
1547
1548 rcu_read_lock();
1549 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1550 if (dev && netif_is_l3_master(dev))
1551 l3index = dev->ifindex;
1552
1553 rcu_read_unlock();
1554
1555 /* ok to reference set/not set outside of rcu;
1556 * right now device MUST be an L3 master
1557 */
1558 if (!dev || !l3index)
1559 return -EINVAL;
1560 }
1561
1562 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1563
1564 if (!cmd.tcpm_keylen)
1565 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1566
1567 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1568 return -EINVAL;
1569
1570 /* Don't allow keys for peers that have a matching TCP-AO key.
1571 * See the comment in tcp_ao_add_cmd()
1572 */
1573 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1574 return -EKEYREJECTED;
1575
1576 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1577 cmd.tcpm_key, cmd.tcpm_keylen);
1578 }
1579
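/* Feed the IPv4 pseudo-header plus the TCP header (with its checksum
 * field zeroed) into the signature hash.
 */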
1580 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1581 __be32 daddr, __be32 saddr,
1582 const struct tcphdr *th, int nbytes)
1583 {
1584 struct tcp4_pseudohdr *bp;
1585 struct scatterlist sg;
1586 struct tcphdr *_th;
1587
1588 bp = hp->scratch;
1589 bp->saddr = saddr;
1590 bp->daddr = daddr;
1591 bp->pad = 0;
1592 bp->protocol = IPPROTO_TCP;
1593 bp->len = cpu_to_be16(nbytes);
1594
1595 _th = (struct tcphdr *)(bp + 1);
1596 memcpy(_th, th, sizeof(*th));
1597 _th->check = 0;
1598
1599 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1600 ahash_request_set_crypt(hp->req, &sg, NULL,
1601 sizeof(*bp) + sizeof(*th));
1602 return crypto_ahash_update(hp->req);
1603 }
1604
1605 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1606 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1607 {
1608 struct tcp_sigpool hp;
1609
1610 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1611 goto clear_hash_nostart;
1612
1613 if (crypto_ahash_init(hp.req))
1614 goto clear_hash;
1615 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1616 goto clear_hash;
1617 if (tcp_md5_hash_key(&hp, key))
1618 goto clear_hash;
1619 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1620 if (crypto_ahash_final(hp.req))
1621 goto clear_hash;
1622
1623 tcp_sigpool_end(&hp);
1624 return 0;
1625
1626 clear_hash:
1627 tcp_sigpool_end(&hp);
1628 clear_hash_nostart:
1629 memset(md5_hash, 0, 16);
1630 return 1;
1631 }
1632
1633 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1634 const struct sock *sk,
1635 const struct sk_buff *skb)
1636 {
1637 const struct tcphdr *th = tcp_hdr(skb);
1638 struct tcp_sigpool hp;
1639 __be32 saddr, daddr;
1640
1641 if (sk) { /* valid for establish/request sockets */
1642 saddr = sk->sk_rcv_saddr;
1643 daddr = sk->sk_daddr;
1644 } else {
1645 const struct iphdr *iph = ip_hdr(skb);
1646 saddr = iph->saddr;
1647 daddr = iph->daddr;
1648 }
1649
1650 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1651 goto clear_hash_nostart;
1652
1653 if (crypto_ahash_init(hp.req))
1654 goto clear_hash;
1655
1656 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1657 goto clear_hash;
1658 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1659 goto clear_hash;
1660 if (tcp_md5_hash_key(&hp, key))
1661 goto clear_hash;
1662 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1663 if (crypto_ahash_final(hp.req))
1664 goto clear_hash;
1665
1666 tcp_sigpool_end(&hp);
1667 return 0;
1668
1669 clear_hash:
1670 tcp_sigpool_end(&hp);
1671 clear_hash_nostart:
1672 memset(md5_hash, 0, 16);
1673 return 1;
1674 }
1675 EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
1676
1677 #endif
1678
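/* Fill in the IPv4 addresses and the saved IP options of a new request
 * sock from the incoming SYN.
 */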
1679 static void tcp_v4_init_req(struct request_sock *req,
1680 const struct sock *sk_listener,
1681 struct sk_buff *skb)
1682 {
1683 struct inet_request_sock *ireq = inet_rsk(req);
1684 struct net *net = sock_net(sk_listener);
1685
1686 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1687 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1688 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1689 }
1690
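/* Initialise the request sock, run the LSM hook, and look up a route for
 * the SYN-ACK.
 */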
1691 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1692 struct sk_buff *skb,
1693 struct flowi *fl,
1694 struct request_sock *req,
1695 u32 tw_isn)
1696 {
1697 tcp_v4_init_req(req, sk, skb);
1698
1699 if (security_inet_conn_request(sk, skb, req))
1700 return NULL;
1701
1702 return inet_csk_route_req(sk, &fl->u.ip4, req);
1703 }
1704
1705 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1706 .family = PF_INET,
1707 .obj_size = sizeof(struct tcp_request_sock),
1708 .send_ack = tcp_v4_reqsk_send_ack,
1709 .destructor = tcp_v4_reqsk_destructor,
1710 .send_reset = tcp_v4_send_reset,
1711 .syn_ack_timeout = tcp_syn_ack_timeout,
1712 };
1713
1714 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1715 .mss_clamp = TCP_MSS_DEFAULT,
1716 #ifdef CONFIG_TCP_MD5SIG
1717 .req_md5_lookup = tcp_v4_md5_lookup,
1718 .calc_md5_hash = tcp_v4_md5_hash_skb,
1719 #endif
1720 #ifdef CONFIG_TCP_AO
1721 .ao_lookup = tcp_v4_ao_lookup_rsk,
1722 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1723 .ao_synack_hash = tcp_v4_ao_synack_hash,
1724 #endif
1725 #ifdef CONFIG_SYN_COOKIES
1726 .cookie_init_seq = cookie_v4_init_sequence,
1727 #endif
1728 .route_req = tcp_v4_route_req,
1729 .init_seq = tcp_v4_init_seq,
1730 .init_ts_off = tcp_v4_init_ts_off,
1731 .send_synack = tcp_v4_send_synack,
1732 };
1733
1734 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1735 {
1736 /* Never answer SYNs sent to broadcast or multicast addresses */
1737 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1738 goto drop;
1739
1740 return tcp_conn_request(&tcp_request_sock_ops,
1741 &tcp_request_sock_ipv4_ops, sk, skb);
1742
1743 drop:
1744 tcp_listendrop(sk);
1745 return 0;
1746 }
1747 EXPORT_IPV6_MOD(tcp_v4_conn_request);
1748
1749
1750 /*
1751 * The three way handshake has completed - we got a valid synack -
1752 * now create the new socket.
1753 */
1754 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1755 struct request_sock *req,
1756 struct dst_entry *dst,
1757 struct request_sock *req_unhash,
1758 bool *own_req)
1759 {
1760 struct inet_request_sock *ireq;
1761 bool found_dup_sk = false;
1762 struct inet_sock *newinet;
1763 struct tcp_sock *newtp;
1764 struct sock *newsk;
1765 #ifdef CONFIG_TCP_MD5SIG
1766 const union tcp_md5_addr *addr;
1767 struct tcp_md5sig_key *key;
1768 int l3index;
1769 #endif
1770 struct ip_options_rcu *inet_opt;
1771
1772 if (sk_acceptq_is_full(sk))
1773 goto exit_overflow;
1774
1775 newsk = tcp_create_openreq_child(sk, req, skb);
1776 if (!newsk)
1777 goto exit_nonewsk;
1778
1779 newsk->sk_gso_type = SKB_GSO_TCPV4;
1780 inet_sk_rx_dst_set(newsk, skb);
1781
1782 newtp = tcp_sk(newsk);
1783 newinet = inet_sk(newsk);
1784 ireq = inet_rsk(req);
1785 inet_opt = rcu_dereference(ireq->ireq_opt);
1786 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1787 newinet->mc_index = inet_iif(skb);
1788 newinet->mc_ttl = ip_hdr(skb)->ttl;
1789 newinet->rcv_tos = ip_hdr(skb)->tos;
1790 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1791 if (inet_opt)
1792 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1793 atomic_set(&newinet->inet_id, get_random_u16());
1794
1795 /* Set ToS of the new socket based upon the value of incoming SYN.
1796 * ECT bits are set later in tcp_init_transfer().
1797 */
1798 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1799 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1800
1801 if (!dst) {
1802 dst = inet_csk_route_child_sock(sk, newsk, req);
1803 if (!dst)
1804 goto put_and_exit;
1805 } else {
1806 /* syncookie case : see end of cookie_v4_check() */
1807 }
1808 sk_setup_caps(newsk, dst);
1809
1810 tcp_ca_openreq_child(newsk, dst);
1811
1812 tcp_sync_mss(newsk, dst_mtu(dst));
1813 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1814
1815 tcp_initialize_rcv_mss(newsk);
1816
1817 #ifdef CONFIG_TCP_MD5SIG
1818 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1819 /* Copy over the MD5 key from the original socket */
1820 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1821 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1822 if (key && !tcp_rsk_used_ao(req)) {
1823 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1824 goto put_and_exit;
1825 sk_gso_disable(newsk);
1826 }
1827 #endif
1828 #ifdef CONFIG_TCP_AO
1829 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1830 goto put_and_exit; /* OOM, release back memory */
1831 #endif
1832
1833 if (__inet_inherit_port(sk, newsk) < 0)
1834 goto put_and_exit;
1835 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1836 &found_dup_sk);
1837 if (likely(*own_req)) {
1838 tcp_move_syn(newtp, req);
1839 ireq->ireq_opt = NULL;
1840 } else {
1841 newinet->inet_opt = NULL;
1842
1843 if (!req_unhash && found_dup_sk) {
1844 /* This code path should only be executed in the
1845 * syncookie case
1846 */
1847 bh_unlock_sock(newsk);
1848 sock_put(newsk);
1849 newsk = NULL;
1850 }
1851 }
1852 return newsk;
1853
1854 exit_overflow:
1855 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1856 exit_nonewsk:
1857 dst_release(dst);
1858 exit:
1859 tcp_listendrop(sk);
1860 return NULL;
1861 put_and_exit:
1862 newinet->inet_opt = NULL;
1863 inet_csk_prepare_forced_close(newsk);
1864 tcp_done(newsk);
1865 goto exit;
1866 }
1867 EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1868
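/* With syncookies, a non-SYN segment reaching a listener may be the ACK
 * that completes a cookie handshake: give cookie_v4_check() a chance to
 * validate it and rebuild the connection state.
 */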
1869 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1870 {
1871 #ifdef CONFIG_SYN_COOKIES
1872 const struct tcphdr *th = tcp_hdr(skb);
1873
1874 if (!th->syn)
1875 sk = cookie_v4_check(sk, skb);
1876 #endif
1877 return sk;
1878 }
1879
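/* Compute a SYN cookie and the clamped MSS for a SYN aimed at a listener
 * without allocating a request sock; the listener is marked as recently
 * overflowed so the later cookie ACK will be accepted.
 * Returns 0 if a cookie cannot be generated.
 */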
1880 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1881 struct tcphdr *th, u32 *cookie)
1882 {
1883 u16 mss = 0;
1884 #ifdef CONFIG_SYN_COOKIES
1885 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1886 &tcp_request_sock_ipv4_ops, sk, th);
1887 if (mss) {
1888 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1889 tcp_synq_overflow(sk);
1890 }
1891 #endif
1892 return mss;
1893 }
1894
1895 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1896 u32));
1897 /* The socket must have its spinlock held when we get
1898 * here, unless it is a TCP_LISTEN socket.
1899 *
1900 * We have a potential double-lock case here, so even when
1901 * doing backlog processing we use the BH locking scheme.
1902 * This is because we cannot sleep with the original spinlock
1903 * held.
1904 */
1905 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1906 {
1907 enum skb_drop_reason reason;
1908 struct sock *rsk;
1909
1910 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1911 struct dst_entry *dst;
1912
1913 dst = rcu_dereference_protected(sk->sk_rx_dst,
1914 lockdep_sock_is_held(sk));
1915
1916 sock_rps_save_rxhash(sk, skb);
1917 sk_mark_napi_id(sk, skb);
1918 if (dst) {
1919 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1920 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1921 dst, 0)) {
1922 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1923 dst_release(dst);
1924 }
1925 }
1926 tcp_rcv_established(sk, skb);
1927 return 0;
1928 }
1929
1930 if (tcp_checksum_complete(skb))
1931 goto csum_err;
1932
1933 if (sk->sk_state == TCP_LISTEN) {
1934 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1935
1936 if (!nsk)
1937 return 0;
1938 if (nsk != sk) {
1939 reason = tcp_child_process(sk, nsk, skb);
1940 if (reason) {
1941 rsk = nsk;
1942 goto reset;
1943 }
1944 return 0;
1945 }
1946 } else
1947 sock_rps_save_rxhash(sk, skb);
1948
1949 reason = tcp_rcv_state_process(sk, skb);
1950 if (reason) {
1951 rsk = sk;
1952 goto reset;
1953 }
1954 return 0;
1955
1956 reset:
1957 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1958 discard:
1959 sk_skb_reason_drop(sk, skb, reason);
1960 /* Be careful here. If this function gets more complicated and
1961 * gcc suffers from register pressure on the x86, sk (in %ebx)
1962 * might be destroyed here. This current version compiles correctly,
1963 * but you have been warned.
1964 */
1965 return 0;
1966
1967 csum_err:
1968 reason = SKB_DROP_REASON_TCP_CSUM;
1969 trace_tcp_bad_csum(skb);
1970 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1971 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1972 goto discard;
1973 }
1974 EXPORT_SYMBOL(tcp_v4_do_rcv);
1975
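/* Early demux, called by the IP layer before the routing decision:
 * look up an established socket and, if its cached rx dst is still valid
 * for this interface, attach it to the skb so the full socket and route
 * lookups can be skipped later.
 */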
1976 int tcp_v4_early_demux(struct sk_buff *skb)
1977 {
1978 struct net *net = dev_net_rcu(skb->dev);
1979 const struct iphdr *iph;
1980 const struct tcphdr *th;
1981 struct sock *sk;
1982
1983 if (skb->pkt_type != PACKET_HOST)
1984 return 0;
1985
1986 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1987 return 0;
1988
1989 iph = ip_hdr(skb);
1990 th = tcp_hdr(skb);
1991
1992 if (th->doff < sizeof(struct tcphdr) / 4)
1993 return 0;
1994
1995 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1996 iph->saddr, th->source,
1997 iph->daddr, ntohs(th->dest),
1998 skb->skb_iif, inet_sdif(skb));
1999 if (sk) {
2000 skb->sk = sk;
2001 skb->destructor = sock_edemux;
2002 if (sk_fullsock(sk)) {
2003 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2004
2005 if (dst)
2006 dst = dst_check(dst, 0);
2007 if (dst &&
2008 sk->sk_rx_dst_ifindex == skb->skb_iif)
2009 skb_dst_set_noref(skb, dst);
2010 }
2011 }
2012 return 0;
2013 }
2014
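/* Queue an skb on the backlog of a socket currently owned by user context.
 * First try to coalesce it with the backlog tail to limit memory usage.
 * Returns true if the caller must drop the skb (*reason is then set and
 * the socket lock has been released), false if the skb was queued or
 * coalesced.
 */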
2015 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2016 enum skb_drop_reason *reason)
2017 {
2018 u32 tail_gso_size, tail_gso_segs;
2019 struct skb_shared_info *shinfo;
2020 const struct tcphdr *th;
2021 struct tcphdr *thtail;
2022 struct sk_buff *tail;
2023 unsigned int hdrlen;
2024 bool fragstolen;
2025 u32 gso_segs;
2026 u32 gso_size;
2027 u64 limit;
2028 int delta;
2029 int err;
2030
2031 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2032 * we can fix skb->truesize to its real value to avoid future drops.
2033 * This is valid because skb is not yet charged to the socket.
2034 * It has been noticed that pure SACK packets were sometimes dropped
2035 * (when cooked by drivers without the copybreak feature).
2036 */
2037 skb_condense(skb);
2038
2039 tcp_cleanup_skb(skb);
2040
2041 if (unlikely(tcp_checksum_complete(skb))) {
2042 bh_unlock_sock(sk);
2043 trace_tcp_bad_csum(skb);
2044 *reason = SKB_DROP_REASON_TCP_CSUM;
2045 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2046 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2047 return true;
2048 }
2049
2050 /* Attempt to coalesce with the last skb in the backlog, even if we
2051 * are above the limits.
2052 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2053 */
2054 th = (const struct tcphdr *)skb->data;
2055 hdrlen = th->doff * 4;
2056
2057 tail = sk->sk_backlog.tail;
2058 if (!tail)
2059 goto no_coalesce;
2060 thtail = (struct tcphdr *)tail->data;
2061
2062 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2063 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2064 ((TCP_SKB_CB(tail)->tcp_flags |
2065 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2066 !((TCP_SKB_CB(tail)->tcp_flags &
2067 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2068 ((TCP_SKB_CB(tail)->tcp_flags ^
2069 TCP_SKB_CB(skb)->tcp_flags) &
2070 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2071 !tcp_skb_can_collapse_rx(tail, skb) ||
2072 thtail->doff != th->doff ||
2073 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2074 goto no_coalesce;
2075
2076 __skb_pull(skb, hdrlen);
2077
2078 shinfo = skb_shinfo(skb);
2079 gso_size = shinfo->gso_size ?: skb->len;
2080 gso_segs = shinfo->gso_segs ?: 1;
2081
2082 shinfo = skb_shinfo(tail);
2083 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2084 tail_gso_segs = shinfo->gso_segs ?: 1;
2085
2086 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2087 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2088
2089 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2090 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2091 thtail->window = th->window;
2092 }
2093
2094 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2095 * thtail->fin, so that the fast path in tcp_rcv_established()
2096 * is not entered if we append a packet with a FIN.
2097 * SYN, RST, URG are not present.
2098 * ACK is set on both packets.
2099 * PSH : we do not really care in TCP stack,
2100 * at least for 'GRO' packets.
2101 */
2102 thtail->fin |= th->fin;
2103 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2104
2105 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2106 TCP_SKB_CB(tail)->has_rxtstamp = true;
2107 tail->tstamp = skb->tstamp;
2108 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2109 }
2110
2111 /* Not as strict as GRO. We only need to carry the max mss value */
2112 shinfo->gso_size = max(gso_size, tail_gso_size);
2113 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2114
2115 sk->sk_backlog.len += delta;
2116 __NET_INC_STATS(sock_net(sk),
2117 LINUX_MIB_TCPBACKLOGCOALESCE);
2118 kfree_skb_partial(skb, fragstolen);
2119 return false;
2120 }
2121 __skb_push(skb, hdrlen);
2122
2123 no_coalesce:
2124 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2125 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2126 * sk_rcvbuf in normal conditions.
2127 */
2128 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2129
2130 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2131
2132 /* Only the socket owner can try to collapse/prune rx queues
2133 * to reduce memory overhead, so add a little headroom here.
2134 * Only a few socket backlogs are likely to be non-empty at once.
2135 */
2136 limit += 64 * 1024;
2137
2138 limit = min_t(u64, limit, UINT_MAX);
2139
2140 err = sk_add_backlog(sk, skb, limit);
2141 if (unlikely(err)) {
2142 bh_unlock_sock(sk);
2143 if (err == -ENOMEM) {
2144 *reason = SKB_DROP_REASON_PFMEMALLOC;
2145 __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2146 } else {
2147 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2148 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2149 }
2150 return true;
2151 }
2152 return false;
2153 }
2154 EXPORT_IPV6_MOD(tcp_add_backlog);
2155
2156 int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
2157 {
2158 struct tcphdr *th = (struct tcphdr *)skb->data;
2159
2160 return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
2161 }
2162 EXPORT_IPV6_MOD(tcp_filter);
2163
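/* Undo tcp_v4_fill_cb(): copy the saved IP control block back to where
 * IPCB() expects it, before the skb is handed to a path that needs the
 * IP layout again (e.g. when redoing the socket lookup).
 */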
2164 static void tcp_v4_restore_cb(struct sk_buff *skb)
2165 {
2166 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2167 sizeof(struct inet_skb_parm));
2168 }
2169
2170 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2171 const struct tcphdr *th)
2172 {
2173 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2174 * barrier() makes sure the compiler won't play aliasing games.
2175 */
2176 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2177 sizeof(struct inet_skb_parm));
2178 barrier();
2179
2180 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2181 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2182 skb->len - th->doff * 4);
2183 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2184 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2185 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2186 TCP_SKB_CB(skb)->sacked = 0;
2187 TCP_SKB_CB(skb)->has_rxtstamp =
2188 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2189 }
2190
2191 /*
2192 * From tcp_input.c
2193 */
2194
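/* Main IPv4 TCP receive routine: validate the header and checksum, look up
 * the socket (full, request or timewait), and either process the segment
 * directly or queue it on the owner's backlog.
 */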
2195 int tcp_v4_rcv(struct sk_buff *skb)
2196 {
2197 struct net *net = dev_net_rcu(skb->dev);
2198 enum skb_drop_reason drop_reason;
2199 enum tcp_tw_status tw_status;
2200 int sdif = inet_sdif(skb);
2201 int dif = inet_iif(skb);
2202 const struct iphdr *iph;
2203 const struct tcphdr *th;
2204 struct sock *sk = NULL;
2205 bool refcounted;
2206 int ret;
2207 u32 isn;
2208
2209 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2210 if (skb->pkt_type != PACKET_HOST)
2211 goto discard_it;
2212
2213 /* Count it even if it's bad */
2214 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2215
2216 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2217 goto discard_it;
2218
2219 th = (const struct tcphdr *)skb->data;
2220
2221 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2222 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2223 goto bad_packet;
2224 }
2225 if (!pskb_may_pull(skb, th->doff * 4))
2226 goto discard_it;
2227
2228 /* An explanation is required here, I think.
2229 * Packet length and doff are validated by header prediction,
2230 * provided the case of th->doff == 0 is eliminated.
2231 * So we defer the checks. */
2232
2233 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2234 goto csum_error;
2235
2236 th = (const struct tcphdr *)skb->data;
2237 iph = ip_hdr(skb);
2238 lookup:
2239 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2240 skb, __tcp_hdrlen(th), th->source,
2241 th->dest, sdif, &refcounted);
2242 if (!sk)
2243 goto no_tcp_socket;
2244
2245 if (sk->sk_state == TCP_TIME_WAIT)
2246 goto do_time_wait;
2247
2248 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2249 struct request_sock *req = inet_reqsk(sk);
2250 bool req_stolen = false;
2251 struct sock *nsk;
2252
2253 sk = req->rsk_listener;
2254 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2255 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2256 else
2257 drop_reason = tcp_inbound_hash(sk, req, skb,
2258 &iph->saddr, &iph->daddr,
2259 AF_INET, dif, sdif);
2260 if (unlikely(drop_reason)) {
2261 sk_drops_add(sk, skb);
2262 reqsk_put(req);
2263 goto discard_it;
2264 }
2265 if (tcp_checksum_complete(skb)) {
2266 reqsk_put(req);
2267 goto csum_error;
2268 }
2269 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2270 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2271 if (!nsk) {
2272 inet_csk_reqsk_queue_drop_and_put(sk, req);
2273 goto lookup;
2274 }
2275 sk = nsk;
2276 /* reuseport_migrate_sock() has already taken one sk_refcnt
2277 * reference before returning.
2278 */
2279 } else {
2280 /* We own a reference on the listener, increase it again
2281 * as we might lose it too soon.
2282 */
2283 sock_hold(sk);
2284 }
2285 refcounted = true;
2286 nsk = NULL;
2287 if (!tcp_filter(sk, skb, &drop_reason)) {
2288 th = (const struct tcphdr *)skb->data;
2289 iph = ip_hdr(skb);
2290 tcp_v4_fill_cb(skb, iph, th);
2291 nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2292 &drop_reason);
2293 }
2294 if (!nsk) {
2295 reqsk_put(req);
2296 if (req_stolen) {
2297 /* Another cpu got exclusive access to req
2298 * and created a full-blown socket.
2299 * Try to feed this packet to this socket
2300 * instead of discarding it.
2301 */
2302 tcp_v4_restore_cb(skb);
2303 sock_put(sk);
2304 goto lookup;
2305 }
2306 goto discard_and_relse;
2307 }
2308 nf_reset_ct(skb);
2309 if (nsk == sk) {
2310 reqsk_put(req);
2311 tcp_v4_restore_cb(skb);
2312 } else {
2313 drop_reason = tcp_child_process(sk, nsk, skb);
2314 if (drop_reason) {
2315 enum sk_rst_reason rst_reason;
2316
2317 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2318 tcp_v4_send_reset(nsk, skb, rst_reason);
2319 goto discard_and_relse;
2320 }
2321 sock_put(sk);
2322 return 0;
2323 }
2324 }
2325
2326 process:
2327 if (static_branch_unlikely(&ip4_min_ttl)) {
2328 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2329 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2330 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2331 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2332 goto discard_and_relse;
2333 }
2334 }
2335
2336 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2337 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2338 goto discard_and_relse;
2339 }
2340
2341 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2342 AF_INET, dif, sdif);
2343 if (drop_reason)
2344 goto discard_and_relse;
2345
2346 nf_reset_ct(skb);
2347
2348 if (tcp_filter(sk, skb, &drop_reason))
2349 goto discard_and_relse;
2350
2351 th = (const struct tcphdr *)skb->data;
2352 iph = ip_hdr(skb);
2353 tcp_v4_fill_cb(skb, iph, th);
2354
2355 skb->dev = NULL;
2356
2357 if (sk->sk_state == TCP_LISTEN) {
2358 ret = tcp_v4_do_rcv(sk, skb);
2359 goto put_and_return;
2360 }
2361
2362 sk_incoming_cpu_update(sk);
2363
2364 bh_lock_sock_nested(sk);
2365 tcp_segs_in(tcp_sk(sk), skb);
2366 ret = 0;
2367 if (!sock_owned_by_user(sk)) {
2368 ret = tcp_v4_do_rcv(sk, skb);
2369 } else {
2370 if (tcp_add_backlog(sk, skb, &drop_reason))
2371 goto discard_and_relse;
2372 }
2373 bh_unlock_sock(sk);
2374
2375 put_and_return:
2376 if (refcounted)
2377 sock_put(sk);
2378
2379 return ret;
2380
2381 no_tcp_socket:
2382 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2383 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2384 goto discard_it;
2385
2386 tcp_v4_fill_cb(skb, iph, th);
2387
2388 if (tcp_checksum_complete(skb)) {
2389 csum_error:
2390 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2391 trace_tcp_bad_csum(skb);
2392 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2393 bad_packet:
2394 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2395 } else {
2396 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2397 }
2398
2399 discard_it:
2400 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2401 /* Discard frame. */
2402 sk_skb_reason_drop(sk, skb, drop_reason);
2403 return 0;
2404
2405 discard_and_relse:
2406 sk_drops_add(sk, skb);
2407 if (refcounted)
2408 sock_put(sk);
2409 goto discard_it;
2410
2411 do_time_wait:
2412 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2413 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2414 inet_twsk_put(inet_twsk(sk));
2415 goto discard_it;
2416 }
2417
2418 tcp_v4_fill_cb(skb, iph, th);
2419
2420 if (tcp_checksum_complete(skb)) {
2421 inet_twsk_put(inet_twsk(sk));
2422 goto csum_error;
2423 }
2424
2425 tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2426 &drop_reason);
2427 switch (tw_status) {
2428 case TCP_TW_SYN: {
2429 struct sock *sk2 = inet_lookup_listener(net,
2430 net->ipv4.tcp_death_row.hashinfo,
2431 skb, __tcp_hdrlen(th),
2432 iph->saddr, th->source,
2433 iph->daddr, th->dest,
2434 inet_iif(skb),
2435 sdif);
2436 if (sk2) {
2437 inet_twsk_deschedule_put(inet_twsk(sk));
2438 sk = sk2;
2439 tcp_v4_restore_cb(skb);
2440 refcounted = false;
2441 __this_cpu_write(tcp_tw_isn, isn);
2442 goto process;
2443 }
2444 }
2445 /* to ACK */
2446 fallthrough;
2447 case TCP_TW_ACK:
2448 case TCP_TW_ACK_OOW:
2449 tcp_v4_timewait_ack(sk, skb, tw_status);
2450 break;
2451 case TCP_TW_RST:
2452 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2453 inet_twsk_deschedule_put(inet_twsk(sk));
2454 goto discard_it;
2455 case TCP_TW_SUCCESS:;
2456 }
2457 goto discard_it;
2458 }
2459
2460 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2461 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2462 .twsk_destructor= tcp_twsk_destructor,
2463 };
2464
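/* Cache the skb's input route on the socket so that tcp_v4_do_rcv() and
 * early demux can reuse it for later packets arriving on the same
 * interface.
 */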
2465 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2466 {
2467 struct dst_entry *dst = skb_dst(skb);
2468
2469 if (dst && dst_hold_safe(dst)) {
2470 rcu_assign_pointer(sk->sk_rx_dst, dst);
2471 sk->sk_rx_dst_ifindex = skb->skb_iif;
2472 }
2473 }
2474 EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2475
2476 const struct inet_connection_sock_af_ops ipv4_specific = {
2477 .queue_xmit = ip_queue_xmit,
2478 .send_check = tcp_v4_send_check,
2479 .rebuild_header = inet_sk_rebuild_header,
2480 .sk_rx_dst_set = inet_sk_rx_dst_set,
2481 .conn_request = tcp_v4_conn_request,
2482 .syn_recv_sock = tcp_v4_syn_recv_sock,
2483 .net_header_len = sizeof(struct iphdr),
2484 .setsockopt = ip_setsockopt,
2485 .getsockopt = ip_getsockopt,
2486 .mtu_reduced = tcp_v4_mtu_reduced,
2487 };
2488 EXPORT_IPV6_MOD(ipv4_specific);
2489
2490 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2491 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2492 #ifdef CONFIG_TCP_MD5SIG
2493 .md5_lookup = tcp_v4_md5_lookup,
2494 .calc_md5_hash = tcp_v4_md5_hash_skb,
2495 .md5_parse = tcp_v4_parse_md5_keys,
2496 #endif
2497 #ifdef CONFIG_TCP_AO
2498 .ao_lookup = tcp_v4_ao_lookup,
2499 .calc_ao_hash = tcp_v4_ao_hash_skb,
2500 .ao_parse = tcp_v4_parse_ao,
2501 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2502 #endif
2503 };
2504 #endif
2505
2506 /* NOTE: A lot of things are set to zero explicitly by the call to
2507 * sk_alloc(), so they need not be done here.
2508 */
2509 static int tcp_v4_init_sock(struct sock *sk)
2510 {
2511 struct inet_connection_sock *icsk = inet_csk(sk);
2512
2513 tcp_init_sock(sk);
2514
2515 icsk->icsk_af_ops = &ipv4_specific;
2516
2517 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2518 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2519 #endif
2520
2521 return 0;
2522 }
2523
2524 #ifdef CONFIG_TCP_MD5SIG
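/* RCU callback freeing the md5sig info of a destroyed socket: also drop
 * the tcp_md5_needed static key reference and the sigpool user count.
 */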
2525 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2526 {
2527 struct tcp_md5sig_info *md5sig;
2528
2529 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2530 kfree(md5sig);
2531 static_branch_slow_dec_deferred(&tcp_md5_needed);
2532 tcp_md5_release_sigpool();
2533 }
2534 #endif
2535
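/* Drop the references on any page pool fragments still tracked in
 * sk->sk_user_frags when the socket is destroyed.
 */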
2536 static void tcp_release_user_frags(struct sock *sk)
2537 {
2538 #ifdef CONFIG_PAGE_POOL
2539 unsigned long index;
2540 void *netmem;
2541
2542 xa_for_each(&sk->sk_user_frags, index, netmem)
2543 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2544 #endif
2545 }
2546
2547 void tcp_v4_destroy_sock(struct sock *sk)
2548 {
2549 struct tcp_sock *tp = tcp_sk(sk);
2550
2551 tcp_release_user_frags(sk);
2552
2553 xa_destroy(&sk->sk_user_frags);
2554
2555 trace_tcp_destroy_sock(sk);
2556
2557 tcp_clear_xmit_timers(sk);
2558
2559 tcp_cleanup_congestion_control(sk);
2560
2561 tcp_cleanup_ulp(sk);
2562
2563 /* Clean up the write buffer. */
2564 tcp_write_queue_purge(sk);
2565
2566 /* Check if we want to disable active TFO */
2567 tcp_fastopen_active_disable_ofo_check(sk);
2568
2569 /* Cleans up our, hopefully empty, out_of_order_queue. */
2570 skb_rbtree_purge(&tp->out_of_order_queue);
2571
2572 #ifdef CONFIG_TCP_MD5SIG
2573 /* Clean up the MD5 key list, if any */
2574 if (tp->md5sig_info) {
2575 struct tcp_md5sig_info *md5sig;
2576
2577 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2578 tcp_clear_md5_list(sk);
2579 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2580 rcu_assign_pointer(tp->md5sig_info, NULL);
2581 }
2582 #endif
2583 tcp_ao_destroy_sock(sk, false);
2584
2585 /* Clean up a referenced TCP bind bucket. */
2586 if (inet_csk(sk)->icsk_bind_hash)
2587 inet_put_port(sk);
2588
2589 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2590
2591 /* If socket is aborted during connect operation */
2592 tcp_free_fastopen_req(tp);
2593 tcp_fastopen_destroy_cipher(sk);
2594 tcp_saved_syn_free(tp);
2595
2596 sk_sockets_allocated_dec(sk);
2597 }
2598 EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2599
2600 #ifdef CONFIG_PROC_FS
2601 /* Proc filesystem TCP sock list dumping. */
2602
2603 static unsigned short seq_file_family(const struct seq_file *seq);
2604
2605 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2606 {
2607 unsigned short family = seq_file_family(seq);
2608
2609 /* AF_UNSPEC is used as a match all */
2610 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2611 net_eq(sock_net(sk), seq_file_net(seq)));
2612 }
2613
2614 /* Find a non-empty bucket (starting from st->bucket)
2615 * and return the first sk from it.
2616 */
2617 static void *listening_get_first(struct seq_file *seq)
2618 {
2619 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2620 struct tcp_iter_state *st = seq->private;
2621
2622 st->offset = 0;
2623 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2624 struct inet_listen_hashbucket *ilb2;
2625 struct hlist_nulls_node *node;
2626 struct sock *sk;
2627
2628 ilb2 = &hinfo->lhash2[st->bucket];
2629 if (hlist_nulls_empty(&ilb2->nulls_head))
2630 continue;
2631
2632 spin_lock(&ilb2->lock);
2633 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2634 if (seq_sk_match(seq, sk))
2635 return sk;
2636 }
2637 spin_unlock(&ilb2->lock);
2638 }
2639
2640 return NULL;
2641 }
2642
2643 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2644 * If "cur" is the last one in the st->bucket,
2645 * call listening_get_first() to return the first sk of the next
2646 * non-empty bucket.
2647 */
2648 static void *listening_get_next(struct seq_file *seq, void *cur)
2649 {
2650 struct tcp_iter_state *st = seq->private;
2651 struct inet_listen_hashbucket *ilb2;
2652 struct hlist_nulls_node *node;
2653 struct inet_hashinfo *hinfo;
2654 struct sock *sk = cur;
2655
2656 ++st->num;
2657 ++st->offset;
2658
2659 sk = sk_nulls_next(sk);
2660 sk_nulls_for_each_from(sk, node) {
2661 if (seq_sk_match(seq, sk))
2662 return sk;
2663 }
2664
2665 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2666 ilb2 = &hinfo->lhash2[st->bucket];
2667 spin_unlock(&ilb2->lock);
2668 ++st->bucket;
2669 return listening_get_first(seq);
2670 }
2671
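/* Walk the listening hash from the first bucket and return the *pos'th
 * socket matching this seq_file (family and netns).
 */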
2672 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2673 {
2674 struct tcp_iter_state *st = seq->private;
2675 void *rc;
2676
2677 st->bucket = 0;
2678 st->offset = 0;
2679 rc = listening_get_first(seq);
2680
2681 while (rc && *pos) {
2682 rc = listening_get_next(seq, rc);
2683 --*pos;
2684 }
2685 return rc;
2686 }
2687
2688 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2689 const struct tcp_iter_state *st)
2690 {
2691 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2692 }
2693
2694 /*
2695 * Get first established socket starting from bucket given in st->bucket.
2696 * If st->bucket is zero, the very first socket in the hash is returned.
2697 */
2698 static void *established_get_first(struct seq_file *seq)
2699 {
2700 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2701 struct tcp_iter_state *st = seq->private;
2702
2703 st->offset = 0;
2704 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2705 struct sock *sk;
2706 struct hlist_nulls_node *node;
2707 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2708
2709 cond_resched();
2710
2711 /* Lockless fast path for the common case of empty buckets */
2712 if (empty_bucket(hinfo, st))
2713 continue;
2714
2715 spin_lock_bh(lock);
2716 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2717 if (seq_sk_match(seq, sk))
2718 return sk;
2719 }
2720 spin_unlock_bh(lock);
2721 }
2722
2723 return NULL;
2724 }
2725
2726 static void *established_get_next(struct seq_file *seq, void *cur)
2727 {
2728 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2729 struct tcp_iter_state *st = seq->private;
2730 struct hlist_nulls_node *node;
2731 struct sock *sk = cur;
2732
2733 ++st->num;
2734 ++st->offset;
2735
2736 sk = sk_nulls_next(sk);
2737
2738 sk_nulls_for_each_from(sk, node) {
2739 if (seq_sk_match(seq, sk))
2740 return sk;
2741 }
2742
2743 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2744 ++st->bucket;
2745 return established_get_first(seq);
2746 }
2747
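/* Walk the established hash from the first bucket and return the pos'th
 * socket matching this seq_file.
 */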
2748 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2749 {
2750 struct tcp_iter_state *st = seq->private;
2751 void *rc;
2752
2753 st->bucket = 0;
2754 rc = established_get_first(seq);
2755
2756 while (rc && pos) {
2757 rc = established_get_next(seq, rc);
2758 --pos;
2759 }
2760 return rc;
2761 }
2762
2763 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2764 {
2765 void *rc;
2766 struct tcp_iter_state *st = seq->private;
2767
2768 st->state = TCP_SEQ_STATE_LISTENING;
2769 rc = listening_get_idx(seq, &pos);
2770
2771 if (!rc) {
2772 st->state = TCP_SEQ_STATE_ESTABLISHED;
2773 rc = established_get_idx(seq, pos);
2774 }
2775
2776 return rc;
2777 }
2778
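/* Try to restart the iteration at the bucket/offset saved in st instead of
 * rescanning from the beginning, moving on to the established hash once
 * the listening buckets are exhausted. Returns NULL if nothing is found
 * at the saved position.
 */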
2779 static void *tcp_seek_last_pos(struct seq_file *seq)
2780 {
2781 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2782 struct tcp_iter_state *st = seq->private;
2783 int bucket = st->bucket;
2784 int offset = st->offset;
2785 int orig_num = st->num;
2786 void *rc = NULL;
2787
2788 switch (st->state) {
2789 case TCP_SEQ_STATE_LISTENING:
2790 if (st->bucket > hinfo->lhash2_mask)
2791 break;
2792 rc = listening_get_first(seq);
2793 while (offset-- && rc && bucket == st->bucket)
2794 rc = listening_get_next(seq, rc);
2795 if (rc)
2796 break;
2797 st->bucket = 0;
2798 st->state = TCP_SEQ_STATE_ESTABLISHED;
2799 fallthrough;
2800 case TCP_SEQ_STATE_ESTABLISHED:
2801 if (st->bucket > hinfo->ehash_mask)
2802 break;
2803 rc = established_get_first(seq);
2804 while (offset-- && rc && bucket == st->bucket)
2805 rc = established_get_next(seq, rc);
2806 }
2807
2808 st->num = orig_num;
2809
2810 return rc;
2811 }
2812
2813 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2814 {
2815 struct tcp_iter_state *st = seq->private;
2816 void *rc;
2817
2818 if (*pos && *pos == st->last_pos) {
2819 rc = tcp_seek_last_pos(seq);
2820 if (rc)
2821 goto out;
2822 }
2823
2824 st->state = TCP_SEQ_STATE_LISTENING;
2825 st->num = 0;
2826 st->bucket = 0;
2827 st->offset = 0;
2828 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2829
2830 out:
2831 st->last_pos = *pos;
2832 return rc;
2833 }
2834 EXPORT_IPV6_MOD(tcp_seq_start);
2835
2836 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2837 {
2838 struct tcp_iter_state *st = seq->private;
2839 void *rc = NULL;
2840
2841 if (v == SEQ_START_TOKEN) {
2842 rc = tcp_get_idx(seq, 0);
2843 goto out;
2844 }
2845
2846 switch (st->state) {
2847 case TCP_SEQ_STATE_LISTENING:
2848 rc = listening_get_next(seq, v);
2849 if (!rc) {
2850 st->state = TCP_SEQ_STATE_ESTABLISHED;
2851 st->bucket = 0;
2852 st->offset = 0;
2853 rc = established_get_first(seq);
2854 }
2855 break;
2856 case TCP_SEQ_STATE_ESTABLISHED:
2857 rc = established_get_next(seq, v);
2858 break;
2859 }
2860 out:
2861 ++*pos;
2862 st->last_pos = *pos;
2863 return rc;
2864 }
2865 EXPORT_IPV6_MOD(tcp_seq_next);
2866
2867 void tcp_seq_stop(struct seq_file *seq, void *v)
2868 {
2869 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2870 struct tcp_iter_state *st = seq->private;
2871
2872 switch (st->state) {
2873 case TCP_SEQ_STATE_LISTENING:
2874 if (v != SEQ_START_TOKEN)
2875 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2876 break;
2877 case TCP_SEQ_STATE_ESTABLISHED:
2878 if (v)
2879 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2880 break;
2881 }
2882 }
2883 EXPORT_IPV6_MOD(tcp_seq_stop);
2884
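/* Emit one SYN_RECV request socket in the /proc/net/tcp format. */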
2885 static void get_openreq4(const struct request_sock *req,
2886 struct seq_file *f, int i)
2887 {
2888 const struct inet_request_sock *ireq = inet_rsk(req);
2889 long delta = req->rsk_timer.expires - jiffies;
2890
2891 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2892 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2893 i,
2894 ireq->ir_loc_addr,
2895 ireq->ir_num,
2896 ireq->ir_rmt_addr,
2897 ntohs(ireq->ir_rmt_port),
2898 TCP_SYN_RECV,
2899 0, 0, /* could print option size, but that is af dependent. */
2900 1, /* timers active (only the expire timer) */
2901 jiffies_delta_to_clock_t(delta),
2902 req->num_timeout,
2903 from_kuid_munged(seq_user_ns(f),
2904 sk_uid(req->rsk_listener)),
2905 0, /* non standard timer */
2906 0, /* open_requests have no inode */
2907 0,
2908 req);
2909 }
2910
2911 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2912 {
2913 int timer_active;
2914 unsigned long timer_expires;
2915 const struct tcp_sock *tp = tcp_sk(sk);
2916 const struct inet_connection_sock *icsk = inet_csk(sk);
2917 const struct inet_sock *inet = inet_sk(sk);
2918 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2919 __be32 dest = inet->inet_daddr;
2920 __be32 src = inet->inet_rcv_saddr;
2921 __u16 destp = ntohs(inet->inet_dport);
2922 __u16 srcp = ntohs(inet->inet_sport);
2923 u8 icsk_pending;
2924 int rx_queue;
2925 int state;
2926
2927 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2928 if (icsk_pending == ICSK_TIME_RETRANS ||
2929 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2930 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2931 timer_active = 1;
2932 timer_expires = icsk_timeout(icsk);
2933 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2934 timer_active = 4;
2935 timer_expires = icsk_timeout(icsk);
2936 } else if (timer_pending(&sk->sk_timer)) {
2937 timer_active = 2;
2938 timer_expires = sk->sk_timer.expires;
2939 } else {
2940 timer_active = 0;
2941 timer_expires = jiffies;
2942 }
2943
2944 state = inet_sk_state_load(sk);
2945 if (state == TCP_LISTEN)
2946 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2947 else
2948 /* Because we don't lock the socket,
2949 * we might find a transient negative value.
2950 */
2951 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2952 READ_ONCE(tp->copied_seq), 0);
2953
2954 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2955 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2956 i, src, srcp, dest, destp, state,
2957 READ_ONCE(tp->write_seq) - tp->snd_una,
2958 rx_queue,
2959 timer_active,
2960 jiffies_delta_to_clock_t(timer_expires - jiffies),
2961 icsk->icsk_retransmits,
2962 from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2963 icsk->icsk_probes_out,
2964 sock_i_ino(sk),
2965 refcount_read(&sk->sk_refcnt), sk,
2966 jiffies_to_clock_t(icsk->icsk_rto),
2967 jiffies_to_clock_t(icsk->icsk_ack.ato),
2968 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2969 tcp_snd_cwnd(tp),
2970 state == TCP_LISTEN ?
2971 fastopenq->max_qlen :
2972 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2973 }
2974
2975 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2976 struct seq_file *f, int i)
2977 {
2978 long delta = tw->tw_timer.expires - jiffies;
2979 __be32 dest, src;
2980 __u16 destp, srcp;
2981
2982 dest = tw->tw_daddr;
2983 src = tw->tw_rcv_saddr;
2984 destp = ntohs(tw->tw_dport);
2985 srcp = ntohs(tw->tw_sport);
2986
2987 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2988 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2989 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2990 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2991 refcount_read(&tw->tw_refcnt), tw);
2992 }
2993
2994 #define TMPSZ 150
2995
2996 static int tcp4_seq_show(struct seq_file *seq, void *v)
2997 {
2998 struct tcp_iter_state *st;
2999 struct sock *sk = v;
3000
3001 seq_setwidth(seq, TMPSZ - 1);
3002 if (v == SEQ_START_TOKEN) {
3003 seq_puts(seq, " sl local_address rem_address st tx_queue "
3004 "rx_queue tr tm->when retrnsmt uid timeout "
3005 "inode");
3006 goto out;
3007 }
3008 st = seq->private;
3009
3010 if (sk->sk_state == TCP_TIME_WAIT)
3011 get_timewait4_sock(v, seq, st->num);
3012 else if (sk->sk_state == TCP_NEW_SYN_RECV)
3013 get_openreq4(v, seq, st->num);
3014 else
3015 get_tcp4_sock(v, seq, st->num);
3016 out:
3017 seq_pad(seq, '\n');
3018 return 0;
3019 }
3020
3021 #ifdef CONFIG_BPF_SYSCALL
3022 union bpf_tcp_iter_batch_item {
3023 struct sock *sk;
3024 __u64 cookie;
3025 };
3026
3027 struct bpf_tcp_iter_state {
3028 struct tcp_iter_state state;
3029 unsigned int cur_sk;
3030 unsigned int end_sk;
3031 unsigned int max_sk;
3032 union bpf_tcp_iter_batch_item *batch;
3033 };
3034
3035 struct bpf_iter__tcp {
3036 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3037 __bpf_md_ptr(struct sock_common *, sk_common);
3038 uid_t uid __aligned(8);
3039 };
3040
3041 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3042 struct sock_common *sk_common, uid_t uid)
3043 {
3044 struct bpf_iter__tcp ctx;
3045
3046 meta->seq_num--; /* skip SEQ_START_TOKEN */
3047 ctx.meta = meta;
3048 ctx.sk_common = sk_common;
3049 ctx.uid = uid;
3050 return bpf_iter_run_prog(prog, &ctx);
3051 }
3052
3053 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3054 {
3055 union bpf_tcp_iter_batch_item *item;
3056 unsigned int cur_sk = iter->cur_sk;
3057 __u64 cookie;
3058
3059 /* Remember the cookies of the sockets we haven't seen yet, so we can
3060 * pick up where we left off next time around.
3061 */
3062 while (cur_sk < iter->end_sk) {
3063 item = &iter->batch[cur_sk++];
3064 cookie = sock_gen_cookie(item->sk);
3065 sock_gen_put(item->sk);
3066 item->cookie = cookie;
3067 }
3068 }
3069
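/* Grow the batch array to new_batch_sz entries, preserving the sockets
 * already batched. Callers use GFP_USER when no bucket lock is held and
 * GFP_NOWAIT while holding one.
 */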
3070 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3071 unsigned int new_batch_sz, gfp_t flags)
3072 {
3073 union bpf_tcp_iter_batch_item *new_batch;
3074
3075 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3076 flags | __GFP_NOWARN);
3077 if (!new_batch)
3078 return -ENOMEM;
3079
3080 memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
3081 kvfree(iter->batch);
3082 iter->batch = new_batch;
3083 iter->max_sk = new_batch_sz;
3084
3085 return 0;
3086 }
3087
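/* Resume within a bucket: for each remembered cookie, scan the chain from
 * first_sk and return the first socket that is still present, so iteration
 * continues where the previous batch stopped even if the bucket changed
 * under us.
 */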
3088 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
3089 union bpf_tcp_iter_batch_item *cookies,
3090 int n_cookies)
3091 {
3092 struct hlist_nulls_node *node;
3093 struct sock *sk;
3094 int i;
3095
3096 for (i = 0; i < n_cookies; i++) {
3097 sk = first_sk;
3098 sk_nulls_for_each_from(sk, node)
3099 if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
3100 return sk;
3101 }
3102
3103 return NULL;
3104 }
3105
3106 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
3107 {
3108 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3109 struct bpf_tcp_iter_state *iter = seq->private;
3110 struct tcp_iter_state *st = &iter->state;
3111 unsigned int find_cookie = iter->cur_sk;
3112 unsigned int end_cookie = iter->end_sk;
3113 int resume_bucket = st->bucket;
3114 struct sock *sk;
3115
3116 if (end_cookie && find_cookie == end_cookie)
3117 ++st->bucket;
3118
3119 sk = listening_get_first(seq);
3120 iter->cur_sk = 0;
3121 iter->end_sk = 0;
3122
3123 if (sk && st->bucket == resume_bucket && end_cookie) {
3124 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3125 end_cookie - find_cookie);
3126 if (!sk) {
3127 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3128 ++st->bucket;
3129 sk = listening_get_first(seq);
3130 }
3131 }
3132
3133 return sk;
3134 }
3135
3136 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
3137 {
3138 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3139 struct bpf_tcp_iter_state *iter = seq->private;
3140 struct tcp_iter_state *st = &iter->state;
3141 unsigned int find_cookie = iter->cur_sk;
3142 unsigned int end_cookie = iter->end_sk;
3143 int resume_bucket = st->bucket;
3144 struct sock *sk;
3145
3146 if (end_cookie && find_cookie == end_cookie)
3147 ++st->bucket;
3148
3149 sk = established_get_first(seq);
3150 iter->cur_sk = 0;
3151 iter->end_sk = 0;
3152
3153 if (sk && st->bucket == resume_bucket && end_cookie) {
3154 sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3155 end_cookie - find_cookie);
3156 if (!sk) {
3157 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3158 ++st->bucket;
3159 sk = established_get_first(seq);
3160 }
3161 }
3162
3163 return sk;
3164 }
3165
3166 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3167 {
3168 struct bpf_tcp_iter_state *iter = seq->private;
3169 struct tcp_iter_state *st = &iter->state;
3170 struct sock *sk = NULL;
3171
3172 switch (st->state) {
3173 case TCP_SEQ_STATE_LISTENING:
3174 sk = bpf_iter_tcp_resume_listening(seq);
3175 if (sk)
3176 break;
3177 st->bucket = 0;
3178 st->state = TCP_SEQ_STATE_ESTABLISHED;
3179 fallthrough;
3180 case TCP_SEQ_STATE_ESTABLISHED:
3181 sk = bpf_iter_tcp_resume_established(seq);
3182 break;
3183 }
3184
3185 return sk;
3186 }
3187
3188 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3189 struct sock **start_sk)
3190 {
3191 struct bpf_tcp_iter_state *iter = seq->private;
3192 struct hlist_nulls_node *node;
3193 unsigned int expected = 1;
3194 struct sock *sk;
3195
3196 sock_hold(*start_sk);
3197 iter->batch[iter->end_sk++].sk = *start_sk;
3198
3199 sk = sk_nulls_next(*start_sk);
3200 *start_sk = NULL;
3201 sk_nulls_for_each_from(sk, node) {
3202 if (seq_sk_match(seq, sk)) {
3203 if (iter->end_sk < iter->max_sk) {
3204 sock_hold(sk);
3205 iter->batch[iter->end_sk++].sk = sk;
3206 } else if (!*start_sk) {
3207 /* Remember where we left off. */
3208 *start_sk = sk;
3209 }
3210 expected++;
3211 }
3212 }
3213
3214 return expected;
3215 }
3216
3217 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3218 struct sock **start_sk)
3219 {
3220 struct bpf_tcp_iter_state *iter = seq->private;
3221 struct hlist_nulls_node *node;
3222 unsigned int expected = 1;
3223 struct sock *sk;
3224
3225 sock_hold(*start_sk);
3226 iter->batch[iter->end_sk++].sk = *start_sk;
3227
3228 sk = sk_nulls_next(*start_sk);
3229 *start_sk = NULL;
3230 sk_nulls_for_each_from(sk, node) {
3231 if (seq_sk_match(seq, sk)) {
3232 if (iter->end_sk < iter->max_sk) {
3233 sock_hold(sk);
3234 iter->batch[iter->end_sk++].sk = sk;
3235 } else if (!*start_sk) {
3236 /* Remember where we left off. */
3237 *start_sk = sk;
3238 }
3239 expected++;
3240 }
3241 }
3242
3243 return expected;
3244 }
3245
3246 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3247 struct sock **start_sk)
3248 {
3249 struct bpf_tcp_iter_state *iter = seq->private;
3250 struct tcp_iter_state *st = &iter->state;
3251
3252 if (st->state == TCP_SEQ_STATE_LISTENING)
3253 return bpf_iter_tcp_listening_batch(seq, start_sk);
3254 else
3255 return bpf_iter_tcp_established_batch(seq, start_sk);
3256 }
3257
3258 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3259 {
3260 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3261 struct bpf_tcp_iter_state *iter = seq->private;
3262 struct tcp_iter_state *st = &iter->state;
3263
3264 if (st->state == TCP_SEQ_STATE_LISTENING)
3265 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3266 else
3267 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3268 }
3269
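/* Batch all matching sockets of the next non-empty bucket, taking a
 * reference on each. If the batch array turns out to be too small, retry
 * once with a larger array, then once more while still holding the bucket
 * lock so the bucket cannot grow again. Returns the first socket of the
 * batch, NULL when the iteration is done, or an ERR_PTR() on allocation
 * failure.
 */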
3270 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3271 {
3272 struct bpf_tcp_iter_state *iter = seq->private;
3273 unsigned int expected;
3274 struct sock *sk;
3275 int err;
3276
3277 sk = bpf_iter_tcp_resume(seq);
3278 if (!sk)
3279 return NULL; /* Done */
3280
3281 expected = bpf_iter_fill_batch(seq, &sk);
3282 if (likely(iter->end_sk == expected))
3283 goto done;
3284
3285 /* Batch size was too small. */
3286 bpf_iter_tcp_unlock_bucket(seq);
3287 bpf_iter_tcp_put_batch(iter);
3288 err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
3289 GFP_USER);
3290 if (err)
3291 return ERR_PTR(err);
3292
3293 sk = bpf_iter_tcp_resume(seq);
3294 if (!sk)
3295 return NULL; /* Done */
3296
3297 expected = bpf_iter_fill_batch(seq, &sk);
3298 if (likely(iter->end_sk == expected))
3299 goto done;
3300
3301 /* Batch size was still too small. Hold onto the lock while we try
3302 * again with a larger batch to make sure the current bucket's size
3303 * does not change in the meantime.
3304 */
3305 err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
3306 if (err) {
3307 bpf_iter_tcp_unlock_bucket(seq);
3308 return ERR_PTR(err);
3309 }
3310
3311 expected = bpf_iter_fill_batch(seq, &sk);
3312 WARN_ON_ONCE(iter->end_sk != expected);
3313 done:
3314 bpf_iter_tcp_unlock_bucket(seq);
3315 return iter->batch[0].sk;
3316 }
3317
3318 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3319 {
3320 /* bpf iter does not support lseek, so it always
3321 * continues from where it was stop()-ped.
3322 */
3323 if (*pos)
3324 return bpf_iter_tcp_batch(seq);
3325
3326 return SEQ_START_TOKEN;
3327 }
3328
3329 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3330 {
3331 struct bpf_tcp_iter_state *iter = seq->private;
3332 struct tcp_iter_state *st = &iter->state;
3333 struct sock *sk;
3334
3335 /* Whenever seq_next() is called, seq_show() is done with the
3336 * socket at iter->cur_sk, so advance to the next sk in
3337 * the batch.
3338 */
3339 if (iter->cur_sk < iter->end_sk) {
3340 /* Keeping st->num consistent in tcp_iter_state.
3341 * bpf_iter_tcp does not use st->num.
3342 * meta.seq_num is used instead.
3343 */
3344 st->num++;
3345 sock_gen_put(iter->batch[iter->cur_sk++].sk);
3346 }
3347
3348 if (iter->cur_sk < iter->end_sk)
3349 sk = iter->batch[iter->cur_sk].sk;
3350 else
3351 sk = bpf_iter_tcp_batch(seq);
3352
3353 ++*pos;
3354 /* Keeping st->last_pos consistent in tcp_iter_state.
3355 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3356 */
3357 st->last_pos = *pos;
3358 return sk;
3359 }
3360
3361 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3362 {
3363 struct bpf_iter_meta meta;
3364 struct bpf_prog *prog;
3365 struct sock *sk = v;
3366 uid_t uid;
3367 int ret;
3368
3369 if (v == SEQ_START_TOKEN)
3370 return 0;
3371
3372 if (sk_fullsock(sk))
3373 lock_sock(sk);
3374
3375 if (unlikely(sk_unhashed(sk))) {
3376 ret = SEQ_SKIP;
3377 goto unlock;
3378 }
3379
3380 if (sk->sk_state == TCP_TIME_WAIT) {
3381 uid = 0;
3382 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3383 const struct request_sock *req = v;
3384
3385 uid = from_kuid_munged(seq_user_ns(seq),
3386 sk_uid(req->rsk_listener));
3387 } else {
3388 uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3389 }
3390
3391 meta.seq = seq;
3392 prog = bpf_iter_get_info(&meta, false);
3393 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3394
3395 unlock:
3396 if (sk_fullsock(sk))
3397 release_sock(sk);
3398 return ret;
3399
3400 }
3401
3402 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3403 {
3404 struct bpf_tcp_iter_state *iter = seq->private;
3405 struct bpf_iter_meta meta;
3406 struct bpf_prog *prog;
3407
3408 if (!v) {
3409 meta.seq = seq;
3410 prog = bpf_iter_get_info(&meta, true);
3411 if (prog)
3412 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3413 }
3414
3415 if (iter->cur_sk < iter->end_sk)
3416 bpf_iter_tcp_put_batch(iter);
3417 }
3418
3419 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3420 .show = bpf_iter_tcp_seq_show,
3421 .start = bpf_iter_tcp_seq_start,
3422 .next = bpf_iter_tcp_seq_next,
3423 .stop = bpf_iter_tcp_seq_stop,
3424 };
3425 #endif
3426 static unsigned short seq_file_family(const struct seq_file *seq)
3427 {
3428 const struct tcp_seq_afinfo *afinfo;
3429
3430 #ifdef CONFIG_BPF_SYSCALL
3431 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3432 if (seq->op == &bpf_iter_tcp_seq_ops)
3433 return AF_UNSPEC;
3434 #endif
3435
3436 /* Iterated from proc fs */
3437 afinfo = pde_data(file_inode(seq->file));
3438 return afinfo->family;
3439 }
3440
3441 static const struct seq_operations tcp4_seq_ops = {
3442 .show = tcp4_seq_show,
3443 .start = tcp_seq_start,
3444 .next = tcp_seq_next,
3445 .stop = tcp_seq_stop,
3446 };
3447
3448 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3449 .family = AF_INET,
3450 };
3451
3452 static int __net_init tcp4_proc_init_net(struct net *net)
3453 {
3454 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3455 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3456 return -ENOMEM;
3457 return 0;
3458 }
3459
3460 static void __net_exit tcp4_proc_exit_net(struct net *net)
3461 {
3462 remove_proc_entry("tcp", net->proc_net);
3463 }
3464
3465 static struct pernet_operations tcp4_net_ops = {
3466 .init = tcp4_proc_init_net,
3467 .exit = tcp4_proc_exit_net,
3468 };
3469
3470 int __init tcp4_proc_init(void)
3471 {
3472 return register_pernet_subsys(&tcp4_net_ops);
3473 }
3474
3475 void tcp4_proc_exit(void)
3476 {
3477 unregister_pernet_subsys(&tcp4_net_ops);
3478 }
3479 #endif /* CONFIG_PROC_FS */
3480
3481 /* @wake is one when sk_stream_write_space() calls us.
3482 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3483 * This mimics the strategy used in sock_def_write_space().
3484 */
3485 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3486 {
3487 const struct tcp_sock *tp = tcp_sk(sk);
3488 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3489 READ_ONCE(tp->snd_nxt);
3490
3491 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3492 }
3493 EXPORT_SYMBOL(tcp_stream_memory_free);
3494
3495 struct proto tcp_prot = {
3496 .name = "TCP",
3497 .owner = THIS_MODULE,
3498 .close = tcp_close,
3499 .pre_connect = tcp_v4_pre_connect,
3500 .connect = tcp_v4_connect,
3501 .disconnect = tcp_disconnect,
3502 .accept = inet_csk_accept,
3503 .ioctl = tcp_ioctl,
3504 .init = tcp_v4_init_sock,
3505 .destroy = tcp_v4_destroy_sock,
3506 .shutdown = tcp_shutdown,
3507 .setsockopt = tcp_setsockopt,
3508 .getsockopt = tcp_getsockopt,
3509 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3510 .keepalive = tcp_set_keepalive,
3511 .recvmsg = tcp_recvmsg,
3512 .sendmsg = tcp_sendmsg,
3513 .splice_eof = tcp_splice_eof,
3514 .backlog_rcv = tcp_v4_do_rcv,
3515 .release_cb = tcp_release_cb,
3516 .hash = inet_hash,
3517 .unhash = inet_unhash,
3518 .get_port = inet_csk_get_port,
3519 .put_port = inet_put_port,
3520 #ifdef CONFIG_BPF_SYSCALL
3521 .psock_update_sk_prot = tcp_bpf_update_proto,
3522 #endif
3523 .enter_memory_pressure = tcp_enter_memory_pressure,
3524 .leave_memory_pressure = tcp_leave_memory_pressure,
3525 .stream_memory_free = tcp_stream_memory_free,
3526 .sockets_allocated = &tcp_sockets_allocated,
3527 .orphan_count = &tcp_orphan_count,
3528
3529 .memory_allocated = &net_aligned_data.tcp_memory_allocated,
3530 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3531
3532 .memory_pressure = &tcp_memory_pressure,
3533 .sysctl_mem = sysctl_tcp_mem,
3534 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3535 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3536 .max_header = MAX_TCP_HEADER,
3537 .obj_size = sizeof(struct tcp_sock),
3538 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3539 .twsk_prot = &tcp_timewait_sock_ops,
3540 .rsk_prot = &tcp_request_sock_ops,
3541 .h.hashinfo = NULL,
3542 .no_autobind = true,
3543 .diag_destroy = tcp_abort,
3544 };
3545 EXPORT_SYMBOL(tcp_prot);
3546
3547 static void __net_exit tcp_sk_exit(struct net *net)
3548 {
3549 if (net->ipv4.tcp_congestion_control)
3550 bpf_module_put(net->ipv4.tcp_congestion_control,
3551 net->ipv4.tcp_congestion_control->owner);
3552 }
3553
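/* Pick the ehash table for a new netns: allocate a private table when the
 * creating netns set sysctl_tcp_child_ehash_entries, otherwise (or on
 * allocation failure) fall back to the global tcp_hashinfo. max_tw_buckets
 * and max_syn_backlog are scaled to the chosen table size.
 */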
3554 static void __net_init tcp_set_hashinfo(struct net *net)
3555 {
3556 struct inet_hashinfo *hinfo;
3557 unsigned int ehash_entries;
3558 struct net *old_net;
3559
3560 if (net_eq(net, &init_net))
3561 goto fallback;
3562
3563 old_net = current->nsproxy->net_ns;
3564 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3565 if (!ehash_entries)
3566 goto fallback;
3567
3568 ehash_entries = roundup_pow_of_two(ehash_entries);
3569 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3570 if (!hinfo) {
3571 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3572 "for a netns, fallback to the global one\n",
3573 ehash_entries);
3574 fallback:
3575 hinfo = &tcp_hashinfo;
3576 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3577 }
3578
3579 net->ipv4.tcp_death_row.hashinfo = hinfo;
3580 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3581 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3582 }
3583
3584 static int __net_init tcp_sk_init(struct net *net)
3585 {
3586 net->ipv4.sysctl_tcp_ecn = 2;
3587 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3588
3589 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3590 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3591 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3592 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3593 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3594
3595 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3596 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3597 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3598
3599 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3600 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3601 net->ipv4.sysctl_tcp_syncookies = 1;
3602 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3603 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3604 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3605 net->ipv4.sysctl_tcp_orphan_retries = 0;
3606 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3607 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3608 net->ipv4.sysctl_tcp_tw_reuse = 2;
3609 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3610 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3611
3612 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3613 tcp_set_hashinfo(net);
3614
3615 net->ipv4.sysctl_tcp_sack = 1;
3616 net->ipv4.sysctl_tcp_window_scaling = 1;
3617 net->ipv4.sysctl_tcp_timestamps = 1;
3618 net->ipv4.sysctl_tcp_early_retrans = 3;
3619 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3620 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3621 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3622 net->ipv4.sysctl_tcp_max_reordering = 300;
3623 net->ipv4.sysctl_tcp_dsack = 1;
3624 net->ipv4.sysctl_tcp_app_win = 31;
3625 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3626 net->ipv4.sysctl_tcp_frto = 2;
3627 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3628 /* This limits the percentage of the congestion window which we
3629 * will allow a single TSO frame to consume. Building TSO frames
3630 * which are too large can cause TCP streams to be bursty.
3631 */
3632 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3633 /* Default TSQ limit of 4 MB */
3634 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3635
3636 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3637 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3638
3639 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3640 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3641 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3642 net->ipv4.sysctl_tcp_autocorking = 1;
3643 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3644 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3645 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3646 if (net != &init_net) {
3647 memcpy(net->ipv4.sysctl_tcp_rmem,
3648 init_net.ipv4.sysctl_tcp_rmem,
3649 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3650 memcpy(net->ipv4.sysctl_tcp_wmem,
3651 init_net.ipv4.sysctl_tcp_wmem,
3652 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3653 }
3654 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3655 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3656 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3657 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3658 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3659 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3660 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3661
3662 /* Set default values for PLB */
3663 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3664 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3665 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3666 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3667 /* Default congestion threshold for PLB to mark a round is 50% */
3668 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3669
3670 /* Reno is always built in */
3671 if (!net_eq(net, &init_net) &&
3672 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3673 init_net.ipv4.tcp_congestion_control->owner))
3674 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3675 else
3676 net->ipv4.tcp_congestion_control = &tcp_reno;
3677
3678 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3679 net->ipv4.sysctl_tcp_shrink_window = 0;
3680
3681 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3682 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3683 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3684
3685 return 0;
3686 }
3687
3688 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3689 {
3690 struct net *net;
3691
3692 /* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3693 * and from the failed setup_net() error unwinding path are serialized.
3694 *
3695 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3696 * net_exit_list, so the thread that dismantles a particular twsk must
3697 * do so without another thread progressing to refcount_dec_and_test() of
3698 * tcp_death_row.tw_refcount.
3699 */
3700 mutex_lock(&tcp_exit_batch_mutex);
3701
3702 tcp_twsk_purge(net_exit_list);
3703
3704 list_for_each_entry(net, net_exit_list, exit_list) {
3705 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3706 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3707 tcp_fastopen_ctx_destroy(net);
3708 }
3709
3710 mutex_unlock(&tcp_exit_batch_mutex);
3711 }
3712
3713 static struct pernet_operations __net_initdata tcp_sk_ops = {
3714 .init = tcp_sk_init,
3715 .exit = tcp_sk_exit,
3716 .exit_batch = tcp_sk_exit_batch,
3717 };
3718
3719 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3720 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3721 struct sock_common *sk_common, uid_t uid)
3722
3723 #define INIT_BATCH_SZ 16
3724
3725 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3726 {
3727 struct bpf_tcp_iter_state *iter = priv_data;
3728 int err;
3729
3730 err = bpf_iter_init_seq_net(priv_data, aux);
3731 if (err)
3732 return err;
3733
3734 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3735 if (err) {
3736 bpf_iter_fini_seq_net(priv_data);
3737 return err;
3738 }
3739
3740 return 0;
3741 }
3742
3743 static void bpf_iter_fini_tcp(void *priv_data)
3744 {
3745 struct bpf_tcp_iter_state *iter = priv_data;
3746
3747 bpf_iter_fini_seq_net(priv_data);
3748 kvfree(iter->batch);
3749 }
3750
3751 static const struct bpf_iter_seq_info tcp_seq_info = {
3752 .seq_ops = &bpf_iter_tcp_seq_ops,
3753 .init_seq_private = bpf_iter_init_tcp,
3754 .fini_seq_private = bpf_iter_fini_tcp,
3755 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3756 };
3757
3758 static const struct bpf_func_proto *
3759 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3760 const struct bpf_prog *prog)
3761 {
3762 switch (func_id) {
3763 case BPF_FUNC_setsockopt:
3764 return &bpf_sk_setsockopt_proto;
3765 case BPF_FUNC_getsockopt:
3766 return &bpf_sk_getsockopt_proto;
3767 default:
3768 return NULL;
3769 }
3770 }
3771
3772 static struct bpf_iter_reg tcp_reg_info = {
3773 .target = "tcp",
3774 .ctx_arg_info_size = 1,
3775 .ctx_arg_info = {
3776 { offsetof(struct bpf_iter__tcp, sk_common),
3777 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3778 },
3779 .get_func_proto = bpf_iter_tcp_get_func_proto,
3780 .seq_info = &tcp_seq_info,
3781 };
3782
3783 static void __init bpf_iter_register(void)
3784 {
3785 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3786 if (bpf_iter_reg_target(&tcp_reg_info))
3787 pr_warn("Warning: could not register bpf iterator tcp\n");
3788 }
3789
3790 #endif
3791
3792 void __init tcp_v4_init(void)
3793 {
3794 int cpu, res;
3795
3796 for_each_possible_cpu(cpu) {
3797 struct sock *sk;
3798
3799 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3800 IPPROTO_TCP, &init_net);
3801 if (res)
3802 panic("Failed to create the TCP control socket.\n");
3803 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3804
3805 /* Please enforce IP_DF and IPID==0 for RST and
3806 * ACKs sent in SYN-RECV and TIME-WAIT states.
3807 */
3808 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3809
3810 sk->sk_clockid = CLOCK_MONOTONIC;
3811
3812 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3813 }
3814 if (register_pernet_subsys(&tcp_sk_ops))
3815 panic("Failed to create the TCP control socket.\n");
3816
3817 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3818 bpf_iter_register();
3819 #endif
3820 }
3821