1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 #include <linux/skbuff_ref.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 #include <trace/events/tcp.h>
88
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96
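/* Per-CPU kernel control socket, guarded by a local BH lock.  It is
 * borrowed by tcp_v4_send_reset() and tcp_v4_send_ack() below to emit
 * replies that have no full socket of their own.
 */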
97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
99 };
100
101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
102
103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104 {
105 return secure_tcp_seq(ip_hdr(skb)->daddr,
106 ip_hdr(skb)->saddr,
107 tcp_hdr(skb)->dest,
108 tcp_hdr(skb)->source);
109 }
110
111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112 {
113 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114 }
115
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119 const struct inet_timewait_sock *tw = inet_twsk(sktw);
120 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 struct tcp_sock *tp = tcp_sk(sk);
122 int ts_recent_stamp;
123 u32 reuse_thresh;
124
125 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
126 reuse = 0;
127
128 if (reuse == 2) {
129 /* Still does not detect *everything* that goes through
130 * lo, since we require a loopback src or dst address
131 * or direct binding to 'lo' interface.
132 */
133 bool loopback = false;
134 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
135 loopback = true;
136 #if IS_ENABLED(CONFIG_IPV6)
137 if (tw->tw_family == AF_INET6) {
138 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
139 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
140 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
141 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
142 loopback = true;
143 } else
144 #endif
145 {
146 if (ipv4_is_loopback(tw->tw_daddr) ||
147 ipv4_is_loopback(tw->tw_rcv_saddr))
148 loopback = true;
149 }
150 if (!loopback)
151 reuse = 0;
152 }
153
154 /* With PAWS, it is safe from the viewpoint
155 of data integrity. Even without PAWS it is safe provided sequence
156 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
157
158 Actually, the idea is close to VJ's: only the timestamp cache is
159 held not per host but per port pair, and the TW bucket is used as the
160 state holder.
161 
162 If the TW bucket has already been destroyed we fall back to VJ's scheme
163 and use the initial timestamp retrieved from the peer table.
164 */
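/* tcp_tw_reuse_delay imposes a minimum age on the TIME-WAIT entry
 * before its port may be reused for an outgoing connection.
 */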
165 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
166 reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
167 READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
168 if (ts_recent_stamp &&
169 (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
170 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
171 * and releasing the bucket lock.
172 */
173 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
174 return 0;
175
176 /* In case of repair and re-using TIME-WAIT sockets we still
177 * want to be sure that it is safe as above but honor the
178 * sequence numbers and time stamps set as part of the repair
179 * process.
180 *
181 * Without this check re-using a TIME-WAIT socket with TCP
182 * repair would accumulate a -1 on the repair assigned
183 * sequence number. The first time it is reused the sequence
184 * is -1, the second time -2, etc. This fixes that issue
185 * without appearing to create any others.
186 */
187 if (likely(!tp->repair)) {
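/* Step the new write_seq well past the old connection's tw_snd_nxt
 * (a full 64K window plus a little slack) so segments from the
 * previous incarnation cannot be confused with the new one.
 */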
188 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
189
190 if (!seq)
191 seq = 1;
192 WRITE_ONCE(tp->write_seq, seq);
193 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
194 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
195 }
196
197 return 1;
198 }
199
200 return 0;
201 }
202 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
203
204 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
205 int addr_len)
206 {
207 /* This check is replicated from tcp_v4_connect() and intended to
208 * prevent BPF program called below from accessing bytes that are out
209 * of the bound specified by user in addr_len.
210 */
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 sock_owned_by_me(sk);
215
216 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
217 }
218
219 /* This will initiate an outgoing connection. */
220 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
221 {
222 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
223 struct inet_timewait_death_row *tcp_death_row;
224 struct inet_sock *inet = inet_sk(sk);
225 struct tcp_sock *tp = tcp_sk(sk);
226 struct ip_options_rcu *inet_opt;
227 struct net *net = sock_net(sk);
228 __be16 orig_sport, orig_dport;
229 __be32 daddr, nexthop;
230 struct flowi4 *fl4;
231 struct rtable *rt;
232 int err;
233
234 if (addr_len < sizeof(struct sockaddr_in))
235 return -EINVAL;
236
237 if (usin->sin_family != AF_INET)
238 return -EAFNOSUPPORT;
239
240 nexthop = daddr = usin->sin_addr.s_addr;
241 inet_opt = rcu_dereference_protected(inet->inet_opt,
242 lockdep_sock_is_held(sk));
243 if (inet_opt && inet_opt->opt.srr) {
244 if (!daddr)
245 return -EINVAL;
246 nexthop = inet_opt->opt.faddr;
247 }
248
249 orig_sport = inet->inet_sport;
250 orig_dport = usin->sin_port;
251 fl4 = &inet->cork.fl.u.ip4;
252 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
253 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
254 orig_dport, sk);
255 if (IS_ERR(rt)) {
256 err = PTR_ERR(rt);
257 if (err == -ENETUNREACH)
258 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
259 return err;
260 }
261
262 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
263 ip_rt_put(rt);
264 return -ENETUNREACH;
265 }
266
267 if (!inet_opt || !inet_opt->opt.srr)
268 daddr = fl4->daddr;
269
270 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
271
272 if (!inet->inet_saddr) {
273 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
274 if (err) {
275 ip_rt_put(rt);
276 return err;
277 }
278 } else {
279 sk_rcv_saddr_set(sk, inet->inet_saddr);
280 }
281
282 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
283 /* Reset inherited state */
284 tp->rx_opt.ts_recent = 0;
285 tp->rx_opt.ts_recent_stamp = 0;
286 if (likely(!tp->repair))
287 WRITE_ONCE(tp->write_seq, 0);
288 }
289
290 inet->inet_dport = usin->sin_port;
291 sk_daddr_set(sk, daddr);
292
293 inet_csk(sk)->icsk_ext_hdr_len = 0;
294 if (inet_opt)
295 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
296
297 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
298
299 /* Socket identity is still unknown (sport may be zero).
300 * However we set the state to SYN-SENT and, without releasing the socket
301 * lock, select a source port, enter ourselves into the hash tables and
302 * complete initialization after this.
303 */
304 tcp_set_state(sk, TCP_SYN_SENT);
305 err = inet_hash_connect(tcp_death_row, sk);
306 if (err)
307 goto failure;
308
309 sk_set_txhash(sk);
310
311 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
312 inet->inet_sport, inet->inet_dport, sk);
313 if (IS_ERR(rt)) {
314 err = PTR_ERR(rt);
315 rt = NULL;
316 goto failure;
317 }
318 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
319 /* OK, now commit destination to socket. */
320 sk->sk_gso_type = SKB_GSO_TCPV4;
321 sk_setup_caps(sk, &rt->dst);
322 rt = NULL;
323
324 if (likely(!tp->repair)) {
325 if (!tp->write_seq)
326 WRITE_ONCE(tp->write_seq,
327 secure_tcp_seq(inet->inet_saddr,
328 inet->inet_daddr,
329 inet->inet_sport,
330 usin->sin_port));
331 WRITE_ONCE(tp->tsoffset,
332 secure_tcp_ts_off(net, inet->inet_saddr,
333 inet->inet_daddr));
334 }
335
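/* Seed the IP ID generator with a random value so IDs are not
 * predictable across connections.
 */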
336 atomic_set(&inet->inet_id, get_random_u16());
337
338 if (tcp_fastopen_defer_connect(sk, &err))
339 return err;
340 if (err)
341 goto failure;
342
343 err = tcp_connect(sk);
344
345 if (err)
346 goto failure;
347
348 return 0;
349
350 failure:
351 /*
352 * This unhashes the socket and releases the local port,
353 * if necessary.
354 */
355 tcp_set_state(sk, TCP_CLOSE);
356 inet_bhash2_reset_saddr(sk);
357 ip_rt_put(rt);
358 sk->sk_route_caps = 0;
359 inet->inet_dport = 0;
360 return err;
361 }
362 EXPORT_SYMBOL(tcp_v4_connect);
363
364 /*
365 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
366 * It can be called through tcp_release_cb() if the socket was owned by the user
367 * at the time tcp_v4_err() was called to handle the ICMP message.
368 */
369 void tcp_v4_mtu_reduced(struct sock *sk)
370 {
371 struct inet_sock *inet = inet_sk(sk);
372 struct dst_entry *dst;
373 u32 mtu;
374
375 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
376 return;
377 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
378 dst = inet_csk_update_pmtu(sk, mtu);
379 if (!dst)
380 return;
381
382 /* Something is about to go wrong... Remember the soft error
383 * in case this connection is not able to recover.
384 */
385 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
386 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
387
388 mtu = dst_mtu(dst);
389
390 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
391 ip_sk_accept_pmtu(sk) &&
392 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
393 tcp_sync_mss(sk, mtu);
394
395 /* Resend the TCP packet because it's
396 * clear that the old packet has been
397 * dropped. This is the new "fast" path mtu
398 * discovery.
399 */
400 tcp_simple_retransmit(sk);
401 } /* else let the usual retransmit timer handle it */
402 }
403 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
404
405 static void do_redirect(struct sk_buff *skb, struct sock *sk)
406 {
407 struct dst_entry *dst = __sk_dst_check(sk, 0);
408
409 if (dst)
410 dst->ops->redirect(dst, sk, skb);
411 }
412
413
414 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
415 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
416 {
417 struct request_sock *req = inet_reqsk(sk);
418 struct net *net = sock_net(sk);
419
420 /* ICMPs are not backlogged, hence we cannot get
421 * an established socket here.
422 */
423 if (seq != tcp_rsk(req)->snt_isn) {
424 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
425 } else if (abort) {
426 /*
427 * Still in SYN_RECV, just remove it silently.
428 * There is no good way to pass the error to the newly
429 * created socket, and POSIX does not want network
430 * errors returned from accept().
431 */
432 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
433 tcp_listendrop(req->rsk_listener);
434 }
435 reqsk_put(req);
436 }
437 EXPORT_SYMBOL(tcp_req_err);
438
439 /* TCP-LD (RFC 6069) logic */
440 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
441 {
442 struct inet_connection_sock *icsk = inet_csk(sk);
443 struct tcp_sock *tp = tcp_sk(sk);
444 struct sk_buff *skb;
445 s32 remaining;
446 u32 delta_us;
447
448 if (sock_owned_by_user(sk))
449 return;
450
451 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
452 !icsk->icsk_backoff)
453 return;
454
455 skb = tcp_rtx_queue_head(sk);
456 if (WARN_ON_ONCE(!skb))
457 return;
458
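/* Per RFC 6069, treat the ICMP message as a hint that earlier losses
 * were a routing transient: undo one exponential-backoff step and
 * re-arm the retransmit timer with whatever time remains.
 */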
459 icsk->icsk_backoff--;
460 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
461 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
462
463 tcp_mstamp_refresh(tp);
464 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
465 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
466
467 if (remaining > 0) {
468 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
469 remaining, TCP_RTO_MAX);
470 } else {
471 /* RTO revert clocked out retransmission.
472 * Will retransmit now.
473 */
474 tcp_retransmit_timer(sk);
475 }
476 }
477 EXPORT_SYMBOL(tcp_ld_RTO_revert);
478
479 /*
480 * This routine is called by the ICMP module when it gets some
481 * sort of error condition. If err < 0 then the socket should
482 * be closed and the error returned to the user. If err > 0
483 * it's just the icmp type << 8 | icmp code. After adjustment
484 * header points to the first 8 bytes of the tcp header. We need
485 * to find the appropriate port.
486 *
487 * The locking strategy used here is very "optimistic". When
488 * someone else accesses the socket the ICMP is just dropped
489 * and for some paths there is no check at all.
490 * A more general error queue to queue errors for later handling
491 * is probably better.
492 *
493 */
494
495 int tcp_v4_err(struct sk_buff *skb, u32 info)
496 {
497 const struct iphdr *iph = (const struct iphdr *)skb->data;
498 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
499 struct tcp_sock *tp;
500 const int type = icmp_hdr(skb)->type;
501 const int code = icmp_hdr(skb)->code;
502 struct sock *sk;
503 struct request_sock *fastopen;
504 u32 seq, snd_una;
505 int err;
506 struct net *net = dev_net(skb->dev);
507
508 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
509 iph->daddr, th->dest, iph->saddr,
510 ntohs(th->source), inet_iif(skb), 0);
511 if (!sk) {
512 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
513 return -ENOENT;
514 }
515 if (sk->sk_state == TCP_TIME_WAIT) {
516 /* To increase the counter of ignored icmps for TCP-AO */
517 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
518 inet_twsk_put(inet_twsk(sk));
519 return 0;
520 }
521 seq = ntohl(th->seq);
522 if (sk->sk_state == TCP_NEW_SYN_RECV) {
523 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
524 type == ICMP_TIME_EXCEEDED ||
525 (type == ICMP_DEST_UNREACH &&
526 (code == ICMP_NET_UNREACH ||
527 code == ICMP_HOST_UNREACH)));
528 return 0;
529 }
530
531 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
532 sock_put(sk);
533 return 0;
534 }
535
536 bh_lock_sock(sk);
537 /* If too many ICMPs get dropped on busy
538 * servers this needs to be solved differently.
539 * We do take care of the PMTU discovery (RFC 1191) special case:
540 * we can receive locally generated ICMP messages while the socket is held.
541 */
542 if (sock_owned_by_user(sk)) {
543 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
544 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
545 }
546 if (sk->sk_state == TCP_CLOSE)
547 goto out;
548
549 if (static_branch_unlikely(&ip4_min_ttl)) {
550 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
551 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
552 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
553 goto out;
554 }
555 }
556
557 tp = tcp_sk(sk);
558 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
559 fastopen = rcu_dereference(tp->fastopen_rsk);
560 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
561 if (sk->sk_state != TCP_LISTEN &&
562 !between(seq, snd_una, tp->snd_nxt)) {
563 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
564 goto out;
565 }
566
567 switch (type) {
568 case ICMP_REDIRECT:
569 if (!sock_owned_by_user(sk))
570 do_redirect(skb, sk);
571 goto out;
572 case ICMP_SOURCE_QUENCH:
573 /* Just silently ignore these. */
574 goto out;
575 case ICMP_PARAMETERPROB:
576 err = EPROTO;
577 break;
578 case ICMP_DEST_UNREACH:
579 if (code > NR_ICMP_UNREACH)
580 goto out;
581
582 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
583 /* We are not interested in TCP_LISTEN and open_requests
584 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
585 * they should go through unfragmented).
586 */
587 if (sk->sk_state == TCP_LISTEN)
588 goto out;
589
590 WRITE_ONCE(tp->mtu_info, info);
591 if (!sock_owned_by_user(sk)) {
592 tcp_v4_mtu_reduced(sk);
593 } else {
594 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
595 sock_hold(sk);
596 }
597 goto out;
598 }
599
600 err = icmp_err_convert[code].errno;
601 /* check if this ICMP message allows revert of backoff.
602 * (see RFC 6069)
603 */
604 if (!fastopen &&
605 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
606 tcp_ld_RTO_revert(sk, seq);
607 break;
608 case ICMP_TIME_EXCEEDED:
609 err = EHOSTUNREACH;
610 break;
611 default:
612 goto out;
613 }
614
615 switch (sk->sk_state) {
616 case TCP_SYN_SENT:
617 case TCP_SYN_RECV:
618 /* Only in fast or simultaneous open. If a fast open socket is
619 * already accepted it is treated as a connected one below.
620 */
621 if (fastopen && !fastopen->sk)
622 break;
623
624 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
625
626 if (!sock_owned_by_user(sk))
627 tcp_done_with_error(sk, err);
628 else
629 WRITE_ONCE(sk->sk_err_soft, err);
630 goto out;
631 }
632
633 /* If we've already connected we will keep trying
634 * until we time out, or the user gives up.
635 *
636 * RFC 1122 4.2.3.9 allows us to consider as hard errors
637 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
638 * but it is obsoleted by PMTU discovery).
639 *
640 * Note that in the modern internet, where routing is unreliable
641 * and broken firewalls sit in every dark corner sending random
642 * errors ordered by their masters, even these two messages have finally
643 * lost their original sense (even Linux sends invalid PORT_UNREACHs).
644 *
645 * Now we are in compliance with RFCs.
646 * --ANK (980905)
647 */
648
649 if (!sock_owned_by_user(sk) &&
650 inet_test_bit(RECVERR, sk)) {
651 WRITE_ONCE(sk->sk_err, err);
652 sk_error_report(sk);
653 } else { /* Only an error on timeout */
654 WRITE_ONCE(sk->sk_err_soft, err);
655 }
656
657 out:
658 bh_unlock_sock(sk);
659 sock_put(sk);
660 return 0;
661 }
662
663 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
664 {
665 struct tcphdr *th = tcp_hdr(skb);
666
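/* Only the pseudo-header checksum is computed here; csum_start and
 * csum_offset tell the device (or the software fallback) where to
 * finish the checksum over the TCP header and payload.
 */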
667 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
668 skb->csum_start = skb_transport_header(skb) - skb->head;
669 skb->csum_offset = offsetof(struct tcphdr, check);
670 }
671
672 /* This routine computes an IPv4 TCP checksum. */
673 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
674 {
675 const struct inet_sock *inet = inet_sk(sk);
676
677 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
678 }
679 EXPORT_SYMBOL(tcp_v4_send_check);
680
681 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
682
683 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
684 const struct tcp_ao_hdr *aoh,
685 struct ip_reply_arg *arg, struct tcphdr *reply,
686 __be32 reply_options[REPLY_OPTIONS_LEN])
687 {
688 #ifdef CONFIG_TCP_AO
689 int sdif = tcp_v4_sdif(skb);
690 int dif = inet_iif(skb);
691 int l3index = sdif ? dif : 0;
692 bool allocated_traffic_key;
693 struct tcp_ao_key *key;
694 char *traffic_key;
695 bool drop = true;
696 u32 ao_sne = 0;
697 u8 keyid;
698
699 rcu_read_lock();
700 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
701 &key, &traffic_key, &allocated_traffic_key,
702 &keyid, &ao_sne))
703 goto out;
704
705 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
706 (aoh->rnext_keyid << 8) | keyid);
707 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
708 reply->doff = arg->iov[0].iov_len / 4;
709
710 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
711 key, traffic_key,
712 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
713 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
714 reply, ao_sne))
715 goto out;
716 drop = false;
717 out:
718 rcu_read_unlock();
719 if (allocated_traffic_key)
720 kfree(traffic_key);
721 return drop;
722 #else
723 return true;
724 #endif
725 }
726
727 /*
728 * This routine will send an RST to the other TCP.
729 *
730 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
731 * for the reset?
732 * Answer: if a packet caused the RST, it is not for a socket
733 * existing in our system; if it is matched to a socket,
734 * it is just a duplicate segment or a bug in the other side's TCP.
735 * So we build the reply based only on parameters
736 * that arrived with the segment.
737 * Exception: precedence violation. We do not implement it in any case.
738 */
739
740 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
741 enum sk_rst_reason reason)
742 {
743 const struct tcphdr *th = tcp_hdr(skb);
744 struct {
745 struct tcphdr th;
746 __be32 opt[REPLY_OPTIONS_LEN];
747 } rep;
748 const __u8 *md5_hash_location = NULL;
749 const struct tcp_ao_hdr *aoh;
750 struct ip_reply_arg arg;
751 #ifdef CONFIG_TCP_MD5SIG
752 struct tcp_md5sig_key *key = NULL;
753 unsigned char newhash[16];
754 struct sock *sk1 = NULL;
755 int genhash;
756 #endif
757 u64 transmit_time = 0;
758 struct sock *ctl_sk;
759 struct net *net;
760 u32 txhash = 0;
761
762 /* Never send a reset in response to a reset. */
763 if (th->rst)
764 return;
765
766 /* If sk not NULL, it means we did a successful lookup and incoming
767 * route had to be correct. prequeue might have dropped our dst.
768 */
769 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
770 return;
771
772 /* Swap the send and the receive. */
773 memset(&rep, 0, sizeof(rep));
774 rep.th.dest = th->source;
775 rep.th.source = th->dest;
776 rep.th.doff = sizeof(struct tcphdr) / 4;
777 rep.th.rst = 1;
778
779 if (th->ack) {
780 rep.th.seq = th->ack_seq;
781 } else {
782 rep.th.ack = 1;
783 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
784 skb->len - (th->doff << 2));
785 }
786
787 memset(&arg, 0, sizeof(arg));
788 arg.iov[0].iov_base = (unsigned char *)&rep;
789 arg.iov[0].iov_len = sizeof(rep.th);
790
791 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
792
793 /* Invalid TCP option size or twice included auth */
794 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
795 return;
796
797 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
798 return;
799
800 #ifdef CONFIG_TCP_MD5SIG
801 rcu_read_lock();
802 if (sk && sk_fullsock(sk)) {
803 const union tcp_md5_addr *addr;
804 int l3index;
805
806 /* sdif set, means packet ingressed via a device
807 * in an L3 domain and inet_iif is set to it.
808 */
809 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
810 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
811 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
812 } else if (md5_hash_location) {
813 const union tcp_md5_addr *addr;
814 int sdif = tcp_v4_sdif(skb);
815 int dif = inet_iif(skb);
816 int l3index;
817
818 /*
819 * The active side is lost. Try to find the listening socket through the
820 * source port, and then find the MD5 key through the listening socket.
821 * We do not lose security here:
822 * the incoming packet is checked against the MD5 hash of the found key;
823 * no RST is generated if the MD5 hash doesn't match.
824 */
825 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
826 NULL, 0, ip_hdr(skb)->saddr,
827 th->source, ip_hdr(skb)->daddr,
828 ntohs(th->source), dif, sdif);
829 /* don't send rst if it can't find key */
830 if (!sk1)
831 goto out;
832
833 /* sdif set, means packet ingressed via a device
834 * in an L3 domain and dif is set to it.
835 */
836 l3index = sdif ? dif : 0;
837 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
838 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
839 if (!key)
840 goto out;
841
842
843 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
844 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
845 goto out;
846
847 }
848
849 if (key) {
850 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
851 (TCPOPT_NOP << 16) |
852 (TCPOPT_MD5SIG << 8) |
853 TCPOLEN_MD5SIG);
854 /* Update length and the length the header thinks exists */
855 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
856 rep.th.doff = arg.iov[0].iov_len / 4;
857
858 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
859 key, ip_hdr(skb)->saddr,
860 ip_hdr(skb)->daddr, &rep.th);
861 }
862 #endif
863 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
864 if (rep.opt[0] == 0) {
865 __be32 mrst = mptcp_reset_option(skb);
866
867 if (mrst) {
868 rep.opt[0] = mrst;
869 arg.iov[0].iov_len += sizeof(mrst);
870 rep.th.doff = arg.iov[0].iov_len / 4;
871 }
872 }
873
874 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
875 ip_hdr(skb)->saddr, /* XXX */
876 arg.iov[0].iov_len, IPPROTO_TCP, 0);
877 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
878 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
879
880 /* When the socket is gone, all binding information is lost and
881 * routing might fail in this case. No choice here: if we choose to force
882 * the input interface, we will misroute in the case of an asymmetric route.
883 */
884 if (sk)
885 arg.bound_dev_if = sk->sk_bound_dev_if;
886
887 trace_tcp_send_reset(sk, skb, reason);
888
889 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
890 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
891
892 arg.tos = ip_hdr(skb)->tos;
893 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
894 local_bh_disable();
895 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
896 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
897
898 sock_net_set(ctl_sk, net);
899 if (sk) {
900 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
901 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
902 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
903 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
904 transmit_time = tcp_transmit_time(sk);
905 xfrm_sk_clone_policy(ctl_sk, sk);
906 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
907 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
908 } else {
909 ctl_sk->sk_mark = 0;
910 ctl_sk->sk_priority = 0;
911 }
912 ip_send_unicast_reply(ctl_sk, sk,
913 skb, &TCP_SKB_CB(skb)->header.h4.opt,
914 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
915 &arg, arg.iov[0].iov_len,
916 transmit_time, txhash);
917
918 xfrm_sk_free_policy(ctl_sk);
919 sock_net_set(ctl_sk, &init_net);
920 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
921 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
922 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
923 local_bh_enable();
924
925 #ifdef CONFIG_TCP_MD5SIG
926 out:
927 rcu_read_unlock();
928 #endif
929 }
930
931 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
932 outside socket context, is certainly ugly. What can I do?
933 */
934
935 static void tcp_v4_send_ack(const struct sock *sk,
936 struct sk_buff *skb, u32 seq, u32 ack,
937 u32 win, u32 tsval, u32 tsecr, int oif,
938 struct tcp_key *key,
939 int reply_flags, u8 tos, u32 txhash)
940 {
941 const struct tcphdr *th = tcp_hdr(skb);
942 struct {
943 struct tcphdr th;
944 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
945 } rep;
946 struct net *net = sock_net(sk);
947 struct ip_reply_arg arg;
948 struct sock *ctl_sk;
949 u64 transmit_time;
950
951 memset(&rep.th, 0, sizeof(struct tcphdr));
952 memset(&arg, 0, sizeof(arg));
953
954 arg.iov[0].iov_base = (unsigned char *)&rep;
955 arg.iov[0].iov_len = sizeof(rep.th);
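/* If the peer used timestamps, echo them back: NOP, NOP, TIMESTAMP
 * occupies three 32-bit words of the reply's option space.
 */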
956 if (tsecr) {
957 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
958 (TCPOPT_TIMESTAMP << 8) |
959 TCPOLEN_TIMESTAMP);
960 rep.opt[1] = htonl(tsval);
961 rep.opt[2] = htonl(tsecr);
962 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
963 }
964
965 /* Swap the send and the receive. */
966 rep.th.dest = th->source;
967 rep.th.source = th->dest;
968 rep.th.doff = arg.iov[0].iov_len / 4;
969 rep.th.seq = htonl(seq);
970 rep.th.ack_seq = htonl(ack);
971 rep.th.ack = 1;
972 rep.th.window = htons(win);
973
974 #ifdef CONFIG_TCP_MD5SIG
975 if (tcp_key_is_md5(key)) {
976 int offset = (tsecr) ? 3 : 0;
977
978 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
979 (TCPOPT_NOP << 16) |
980 (TCPOPT_MD5SIG << 8) |
981 TCPOLEN_MD5SIG);
982 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
983 rep.th.doff = arg.iov[0].iov_len/4;
984
985 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
986 key->md5_key, ip_hdr(skb)->saddr,
987 ip_hdr(skb)->daddr, &rep.th);
988 }
989 #endif
990 #ifdef CONFIG_TCP_AO
991 if (tcp_key_is_ao(key)) {
992 int offset = (tsecr) ? 3 : 0;
993
994 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
995 (tcp_ao_len(key->ao_key) << 16) |
996 (key->ao_key->sndid << 8) |
997 key->rcv_next);
998 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
999 rep.th.doff = arg.iov[0].iov_len / 4;
1000
1001 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1002 key->ao_key, key->traffic_key,
1003 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1004 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1005 &rep.th, key->sne);
1006 }
1007 #endif
1008 arg.flags = reply_flags;
1009 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1010 ip_hdr(skb)->saddr, /* XXX */
1011 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1012 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1013 if (oif)
1014 arg.bound_dev_if = oif;
1015 arg.tos = tos;
1016 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1017 local_bh_disable();
1018 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1019 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1020 sock_net_set(ctl_sk, net);
1021 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1022 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1023 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1024 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1025 transmit_time = tcp_transmit_time(sk);
1026 ip_send_unicast_reply(ctl_sk, sk,
1027 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1028 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1029 &arg, arg.iov[0].iov_len,
1030 transmit_time, txhash);
1031
1032 sock_net_set(ctl_sk, &init_net);
1033 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1034 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1035 local_bh_enable();
1036 }
1037
1038 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1039 {
1040 struct inet_timewait_sock *tw = inet_twsk(sk);
1041 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1042 struct tcp_key key = {};
1043 #ifdef CONFIG_TCP_AO
1044 struct tcp_ao_info *ao_info;
1045
1046 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1047 /* FIXME: the segment to-be-acked is not verified yet */
1048 ao_info = rcu_dereference(tcptw->ao_info);
1049 if (ao_info) {
1050 const struct tcp_ao_hdr *aoh;
1051
1052 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1053 inet_twsk_put(tw);
1054 return;
1055 }
1056
1057 if (aoh)
1058 key.ao_key = tcp_ao_established_key(sk, ao_info,
1059 aoh->rnext_keyid, -1);
1060 }
1061 }
1062 if (key.ao_key) {
1063 struct tcp_ao_key *rnext_key;
1064
1065 key.traffic_key = snd_other_key(key.ao_key);
1066 key.sne = READ_ONCE(ao_info->snd_sne);
1067 rnext_key = READ_ONCE(ao_info->rnext_key);
1068 key.rcv_next = rnext_key->rcvid;
1069 key.type = TCP_KEY_AO;
1070 #else
1071 if (0) {
1072 #endif
1073 } else if (static_branch_tcp_md5()) {
1074 key.md5_key = tcp_twsk_md5_key(tcptw);
1075 if (key.md5_key)
1076 key.type = TCP_KEY_MD5;
1077 }
1078
1079 tcp_v4_send_ack(sk, skb,
1080 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1081 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1082 tcp_tw_tsval(tcptw),
1083 READ_ONCE(tcptw->tw_ts_recent),
1084 tw->tw_bound_dev_if, &key,
1085 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1086 tw->tw_tos,
1087 tw->tw_txhash);
1088
1089 inet_twsk_put(tw);
1090 }
1091
1092 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1093 struct request_sock *req)
1094 {
1095 struct tcp_key key = {};
1096
1097 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1098 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1099 */
1100 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1101 tcp_sk(sk)->snd_nxt;
1102
1103 #ifdef CONFIG_TCP_AO
1104 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1105 tcp_rsk_used_ao(req)) {
1106 const union tcp_md5_addr *addr;
1107 const struct tcp_ao_hdr *aoh;
1108 int l3index;
1109
1110 /* Invalid TCP option size or twice included auth */
1111 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1112 return;
1113 if (!aoh)
1114 return;
1115
1116 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1117 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1118 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1119 aoh->rnext_keyid, -1);
1120 if (unlikely(!key.ao_key)) {
1121 /* Send ACK with any matching MKT for the peer */
1122 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1123 /* The matching key disappeared (user removed the key?);
1124 * let the handshake time out.
1125 */
1126 if (!key.ao_key) {
1127 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1128 addr,
1129 ntohs(tcp_hdr(skb)->source),
1130 &ip_hdr(skb)->daddr,
1131 ntohs(tcp_hdr(skb)->dest));
1132 return;
1133 }
1134 }
1135 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1136 if (!key.traffic_key)
1137 return;
1138
1139 key.type = TCP_KEY_AO;
1140 key.rcv_next = aoh->keyid;
1141 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1142 #else
1143 if (0) {
1144 #endif
1145 } else if (static_branch_tcp_md5()) {
1146 const union tcp_md5_addr *addr;
1147 int l3index;
1148
1149 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1150 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1151 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152 if (key.md5_key)
1153 key.type = TCP_KEY_MD5;
1154 }
1155
1156 tcp_v4_send_ack(sk, skb, seq,
1157 tcp_rsk(req)->rcv_nxt,
1158 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1159 tcp_rsk_tsval(tcp_rsk(req)),
1160 READ_ONCE(req->ts_recent),
1161 0, &key,
1162 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1163 ip_hdr(skb)->tos,
1164 READ_ONCE(tcp_rsk(req)->txhash));
1165 if (tcp_key_is_ao(&key))
1166 kfree(key.traffic_key);
1167 }
1168
1169 /*
1170 * Send a SYN-ACK after having received a SYN.
1171 * This still operates on a request_sock only, not on a big
1172 * socket.
1173 */
1174 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1175 struct flowi *fl,
1176 struct request_sock *req,
1177 struct tcp_fastopen_cookie *foc,
1178 enum tcp_synack_type synack_type,
1179 struct sk_buff *syn_skb)
1180 {
1181 const struct inet_request_sock *ireq = inet_rsk(req);
1182 struct flowi4 fl4;
1183 int err = -1;
1184 struct sk_buff *skb;
1185 u8 tos;
1186
1187 /* First, grab a route. */
1188 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1189 return -1;
1190
1191 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1192
1193 if (skb) {
1194 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1195
1196 tos = READ_ONCE(inet_sk(sk)->tos);
1197
1198 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1199 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1200 (tos & INET_ECN_MASK);
1201
1202 if (!INET_ECN_is_capable(tos) &&
1203 tcp_bpf_ca_needs_ecn((struct sock *)req))
1204 tos |= INET_ECN_ECT_0;
1205
1206 rcu_read_lock();
1207 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1208 ireq->ir_rmt_addr,
1209 rcu_dereference(ireq->ireq_opt),
1210 tos);
1211 rcu_read_unlock();
1212 err = net_xmit_eval(err);
1213 }
1214
1215 return err;
1216 }
1217
1218 /*
1219 * IPv4 request_sock destructor.
1220 */
1221 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1222 {
1223 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1224 }
1225
1226 #ifdef CONFIG_TCP_MD5SIG
1227 /*
1228 * RFC2385 MD5 checksumming requires a mapping of
1229 * IP address->MD5 Key.
1230 * We need to maintain these in the sk structure.
1231 */
1232
1233 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1234 EXPORT_SYMBOL(tcp_md5_needed);
1235
1236 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1237 {
1238 if (!old)
1239 return true;
1240
1241 /* l3index always overrides non-l3index */
1242 if (old->l3index && new->l3index == 0)
1243 return false;
1244 if (old->l3index == 0 && new->l3index)
1245 return true;
1246
1247 return old->prefixlen < new->prefixlen;
1248 }
1249
1250 /* Find the Key structure for an address. */
1251 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1252 const union tcp_md5_addr *addr,
1253 int family, bool any_l3index)
1254 {
1255 const struct tcp_sock *tp = tcp_sk(sk);
1256 struct tcp_md5sig_key *key;
1257 const struct tcp_md5sig_info *md5sig;
1258 __be32 mask;
1259 struct tcp_md5sig_key *best_match = NULL;
1260 bool match;
1261
1262 /* caller either holds rcu_read_lock() or socket lock */
1263 md5sig = rcu_dereference_check(tp->md5sig_info,
1264 lockdep_sock_is_held(sk));
1265 if (!md5sig)
1266 return NULL;
1267
1268 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1269 lockdep_sock_is_held(sk)) {
1270 if (key->family != family)
1271 continue;
1272 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1273 key->l3index != l3index)
1274 continue;
1275 if (family == AF_INET) {
1276 mask = inet_make_mask(key->prefixlen);
1277 match = (key->addr.a4.s_addr & mask) ==
1278 (addr->a4.s_addr & mask);
1279 #if IS_ENABLED(CONFIG_IPV6)
1280 } else if (family == AF_INET6) {
1281 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1282 key->prefixlen);
1283 #endif
1284 } else {
1285 match = false;
1286 }
1287
1288 if (match && better_md5_match(best_match, key))
1289 best_match = key;
1290 }
1291 return best_match;
1292 }
1293 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1294
1295 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1296 const union tcp_md5_addr *addr,
1297 int family, u8 prefixlen,
1298 int l3index, u8 flags)
1299 {
1300 const struct tcp_sock *tp = tcp_sk(sk);
1301 struct tcp_md5sig_key *key;
1302 unsigned int size = sizeof(struct in_addr);
1303 const struct tcp_md5sig_info *md5sig;
1304
1305 /* caller either holds rcu_read_lock() or socket lock */
1306 md5sig = rcu_dereference_check(tp->md5sig_info,
1307 lockdep_sock_is_held(sk));
1308 if (!md5sig)
1309 return NULL;
1310 #if IS_ENABLED(CONFIG_IPV6)
1311 if (family == AF_INET6)
1312 size = sizeof(struct in6_addr);
1313 #endif
1314 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1315 lockdep_sock_is_held(sk)) {
1316 if (key->family != family)
1317 continue;
1318 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1319 continue;
1320 if (key->l3index != l3index)
1321 continue;
1322 if (!memcmp(&key->addr, addr, size) &&
1323 key->prefixlen == prefixlen)
1324 return key;
1325 }
1326 return NULL;
1327 }
1328
1329 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1330 const struct sock *addr_sk)
1331 {
1332 const union tcp_md5_addr *addr;
1333 int l3index;
1334
1335 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1336 addr_sk->sk_bound_dev_if);
1337 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1338 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1339 }
1340 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1341
1342 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1343 {
1344 struct tcp_sock *tp = tcp_sk(sk);
1345 struct tcp_md5sig_info *md5sig;
1346
1347 md5sig = kmalloc(sizeof(*md5sig), gfp);
1348 if (!md5sig)
1349 return -ENOMEM;
1350
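/* MD5 signatures must be computed per segment, so GSO has to be
 * disabled once signing is enabled on this socket.
 */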
1351 sk_gso_disable(sk);
1352 INIT_HLIST_HEAD(&md5sig->head);
1353 rcu_assign_pointer(tp->md5sig_info, md5sig);
1354 return 0;
1355 }
1356
1357 /* This can be called on a newly created socket, from other files */
1358 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1359 int family, u8 prefixlen, int l3index, u8 flags,
1360 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1361 {
1362 /* Add Key to the list */
1363 struct tcp_md5sig_key *key;
1364 struct tcp_sock *tp = tcp_sk(sk);
1365 struct tcp_md5sig_info *md5sig;
1366
1367 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1368 if (key) {
1369 /* Pre-existing entry - just update that one.
1370 * Note that the key might be used concurrently.
1371 * data_race() tells KCSAN that we do not care about
1372 * key mismatches, since changing the MD5 key on live flows
1373 * can lead to packet drops.
1374 */
1375 data_race(memcpy(key->key, newkey, newkeylen));
1376
1377 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1378 * Also note that a reader could catch a new key->keylen value
1379 * but the old key->key[]; this is the reason we use __GFP_ZERO
1380 * at sock_kmalloc() time below these lines.
1381 */
1382 WRITE_ONCE(key->keylen, newkeylen);
1383
1384 return 0;
1385 }
1386
1387 md5sig = rcu_dereference_protected(tp->md5sig_info,
1388 lockdep_sock_is_held(sk));
1389
1390 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1391 if (!key)
1392 return -ENOMEM;
1393
1394 memcpy(key->key, newkey, newkeylen);
1395 key->keylen = newkeylen;
1396 key->family = family;
1397 key->prefixlen = prefixlen;
1398 key->l3index = l3index;
1399 key->flags = flags;
1400 memcpy(&key->addr, addr,
1401 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1402 sizeof(struct in_addr));
1403 hlist_add_head_rcu(&key->node, &md5sig->head);
1404 return 0;
1405 }
1406
1407 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1408 int family, u8 prefixlen, int l3index, u8 flags,
1409 const u8 *newkey, u8 newkeylen)
1410 {
1411 struct tcp_sock *tp = tcp_sk(sk);
1412
1413 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1414 if (tcp_md5_alloc_sigpool())
1415 return -ENOMEM;
1416
1417 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1418 tcp_md5_release_sigpool();
1419 return -ENOMEM;
1420 }
1421
1422 if (!static_branch_inc(&tcp_md5_needed.key)) {
1423 struct tcp_md5sig_info *md5sig;
1424
1425 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1426 rcu_assign_pointer(tp->md5sig_info, NULL);
1427 kfree_rcu(md5sig, rcu);
1428 tcp_md5_release_sigpool();
1429 return -EUSERS;
1430 }
1431 }
1432
1433 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1434 newkey, newkeylen, GFP_KERNEL);
1435 }
1436 EXPORT_SYMBOL(tcp_md5_do_add);
1437
1438 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1439 int family, u8 prefixlen, int l3index,
1440 struct tcp_md5sig_key *key)
1441 {
1442 struct tcp_sock *tp = tcp_sk(sk);
1443
1444 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1445 tcp_md5_add_sigpool();
1446
1447 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1448 tcp_md5_release_sigpool();
1449 return -ENOMEM;
1450 }
1451
1452 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1453 struct tcp_md5sig_info *md5sig;
1454
1455 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1456 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1457 rcu_assign_pointer(tp->md5sig_info, NULL);
1458 kfree_rcu(md5sig, rcu);
1459 tcp_md5_release_sigpool();
1460 return -EUSERS;
1461 }
1462 }
1463
1464 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1465 key->flags, key->key, key->keylen,
1466 sk_gfp_mask(sk, GFP_ATOMIC));
1467 }
1468 EXPORT_SYMBOL(tcp_md5_key_copy);
1469
1470 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1471 u8 prefixlen, int l3index, u8 flags)
1472 {
1473 struct tcp_md5sig_key *key;
1474
1475 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1476 if (!key)
1477 return -ENOENT;
1478 hlist_del_rcu(&key->node);
1479 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1480 kfree_rcu(key, rcu);
1481 return 0;
1482 }
1483 EXPORT_SYMBOL(tcp_md5_do_del);
1484
1485 void tcp_clear_md5_list(struct sock *sk)
1486 {
1487 struct tcp_sock *tp = tcp_sk(sk);
1488 struct tcp_md5sig_key *key;
1489 struct hlist_node *n;
1490 struct tcp_md5sig_info *md5sig;
1491
1492 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1493
1494 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1495 hlist_del_rcu(&key->node);
1496 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1497 kfree_rcu(key, rcu);
1498 }
1499 }
1500
1501 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1502 sockptr_t optval, int optlen)
1503 {
1504 struct tcp_md5sig cmd;
1505 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1506 const union tcp_md5_addr *addr;
1507 u8 prefixlen = 32;
1508 int l3index = 0;
1509 bool l3flag;
1510 u8 flags;
1511
1512 if (optlen < sizeof(cmd))
1513 return -EINVAL;
1514
1515 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1516 return -EFAULT;
1517
1518 if (sin->sin_family != AF_INET)
1519 return -EINVAL;
1520
1521 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1522 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1523
1524 if (optname == TCP_MD5SIG_EXT &&
1525 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1526 prefixlen = cmd.tcpm_prefixlen;
1527 if (prefixlen > 32)
1528 return -EINVAL;
1529 }
1530
1531 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1532 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1533 struct net_device *dev;
1534
1535 rcu_read_lock();
1536 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1537 if (dev && netif_is_l3_master(dev))
1538 l3index = dev->ifindex;
1539
1540 rcu_read_unlock();
1541
1542 /* ok to reference set/not set outside of rcu;
1543 * right now device MUST be an L3 master
1544 */
1545 if (!dev || !l3index)
1546 return -EINVAL;
1547 }
1548
1549 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1550
1551 if (!cmd.tcpm_keylen)
1552 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1553
1554 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1555 return -EINVAL;
1556
1557 /* Don't allow keys for peers that have a matching TCP-AO key.
1558 * See the comment in tcp_ao_add_cmd()
1559 */
1560 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1561 return -EKEYREJECTED;
1562
1563 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1564 cmd.tcpm_key, cmd.tcpm_keylen);
1565 }
1566
1567 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1568 __be32 daddr, __be32 saddr,
1569 const struct tcphdr *th, int nbytes)
1570 {
1571 struct tcp4_pseudohdr *bp;
1572 struct scatterlist sg;
1573 struct tcphdr *_th;
1574
1575 bp = hp->scratch;
1576 bp->saddr = saddr;
1577 bp->daddr = daddr;
1578 bp->pad = 0;
1579 bp->protocol = IPPROTO_TCP;
1580 bp->len = cpu_to_be16(nbytes);
1581
1582 _th = (struct tcphdr *)(bp + 1);
1583 memcpy(_th, th, sizeof(*th));
1584 _th->check = 0;
1585
1586 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1587 ahash_request_set_crypt(hp->req, &sg, NULL,
1588 sizeof(*bp) + sizeof(*th));
1589 return crypto_ahash_update(hp->req);
1590 }
1591
1592 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1593 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1594 {
1595 struct tcp_sigpool hp;
1596
1597 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1598 goto clear_hash_nostart;
1599
1600 if (crypto_ahash_init(hp.req))
1601 goto clear_hash;
1602 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1603 goto clear_hash;
1604 if (tcp_md5_hash_key(&hp, key))
1605 goto clear_hash;
1606 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1607 if (crypto_ahash_final(hp.req))
1608 goto clear_hash;
1609
1610 tcp_sigpool_end(&hp);
1611 return 0;
1612
1613 clear_hash:
1614 tcp_sigpool_end(&hp);
1615 clear_hash_nostart:
1616 memset(md5_hash, 0, 16);
1617 return 1;
1618 }
1619
1620 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1621 const struct sock *sk,
1622 const struct sk_buff *skb)
1623 {
1624 const struct tcphdr *th = tcp_hdr(skb);
1625 struct tcp_sigpool hp;
1626 __be32 saddr, daddr;
1627
1628 if (sk) { /* valid for establish/request sockets */
1629 saddr = sk->sk_rcv_saddr;
1630 daddr = sk->sk_daddr;
1631 } else {
1632 const struct iphdr *iph = ip_hdr(skb);
1633 saddr = iph->saddr;
1634 daddr = iph->daddr;
1635 }
1636
1637 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1638 goto clear_hash_nostart;
1639
1640 if (crypto_ahash_init(hp.req))
1641 goto clear_hash;
1642
1643 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1644 goto clear_hash;
1645 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1646 goto clear_hash;
1647 if (tcp_md5_hash_key(&hp, key))
1648 goto clear_hash;
1649 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1650 if (crypto_ahash_final(hp.req))
1651 goto clear_hash;
1652
1653 tcp_sigpool_end(&hp);
1654 return 0;
1655
1656 clear_hash:
1657 tcp_sigpool_end(&hp);
1658 clear_hash_nostart:
1659 memset(md5_hash, 0, 16);
1660 return 1;
1661 }
1662 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1663
1664 #endif
1665
1666 static void tcp_v4_init_req(struct request_sock *req,
1667 const struct sock *sk_listener,
1668 struct sk_buff *skb)
1669 {
1670 struct inet_request_sock *ireq = inet_rsk(req);
1671 struct net *net = sock_net(sk_listener);
1672
1673 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1674 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1675 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1676 }
1677
1678 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1679 struct sk_buff *skb,
1680 struct flowi *fl,
1681 struct request_sock *req,
1682 u32 tw_isn)
1683 {
1684 tcp_v4_init_req(req, sk, skb);
1685
1686 if (security_inet_conn_request(sk, skb, req))
1687 return NULL;
1688
1689 return inet_csk_route_req(sk, &fl->u.ip4, req);
1690 }
1691
1692 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1693 .family = PF_INET,
1694 .obj_size = sizeof(struct tcp_request_sock),
1695 .rtx_syn_ack = tcp_rtx_synack,
1696 .send_ack = tcp_v4_reqsk_send_ack,
1697 .destructor = tcp_v4_reqsk_destructor,
1698 .send_reset = tcp_v4_send_reset,
1699 .syn_ack_timeout = tcp_syn_ack_timeout,
1700 };
1701
1702 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1703 .mss_clamp = TCP_MSS_DEFAULT,
1704 #ifdef CONFIG_TCP_MD5SIG
1705 .req_md5_lookup = tcp_v4_md5_lookup,
1706 .calc_md5_hash = tcp_v4_md5_hash_skb,
1707 #endif
1708 #ifdef CONFIG_TCP_AO
1709 .ao_lookup = tcp_v4_ao_lookup_rsk,
1710 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1711 .ao_synack_hash = tcp_v4_ao_synack_hash,
1712 #endif
1713 #ifdef CONFIG_SYN_COOKIES
1714 .cookie_init_seq = cookie_v4_init_sequence,
1715 #endif
1716 .route_req = tcp_v4_route_req,
1717 .init_seq = tcp_v4_init_seq,
1718 .init_ts_off = tcp_v4_init_ts_off,
1719 .send_synack = tcp_v4_send_synack,
1720 };
1721
1722 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1723 {
1724 /* Never answer SYNs sent to broadcast or multicast addresses */
1725 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1726 goto drop;
1727
1728 return tcp_conn_request(&tcp_request_sock_ops,
1729 &tcp_request_sock_ipv4_ops, sk, skb);
1730
1731 drop:
1732 tcp_listendrop(sk);
1733 return 0;
1734 }
1735 EXPORT_SYMBOL(tcp_v4_conn_request);
1736
1737
1738 /*
1739 * The three way handshake has completed - we got a valid synack -
1740 * now create the new socket.
1741 */
1742 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1743 struct request_sock *req,
1744 struct dst_entry *dst,
1745 struct request_sock *req_unhash,
1746 bool *own_req)
1747 {
1748 struct inet_request_sock *ireq;
1749 bool found_dup_sk = false;
1750 struct inet_sock *newinet;
1751 struct tcp_sock *newtp;
1752 struct sock *newsk;
1753 #ifdef CONFIG_TCP_MD5SIG
1754 const union tcp_md5_addr *addr;
1755 struct tcp_md5sig_key *key;
1756 int l3index;
1757 #endif
1758 struct ip_options_rcu *inet_opt;
1759
1760 if (sk_acceptq_is_full(sk))
1761 goto exit_overflow;
1762
1763 newsk = tcp_create_openreq_child(sk, req, skb);
1764 if (!newsk)
1765 goto exit_nonewsk;
1766
1767 newsk->sk_gso_type = SKB_GSO_TCPV4;
1768 inet_sk_rx_dst_set(newsk, skb);
1769
1770 newtp = tcp_sk(newsk);
1771 newinet = inet_sk(newsk);
1772 ireq = inet_rsk(req);
1773 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1774 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1775 newsk->sk_bound_dev_if = ireq->ir_iif;
1776 newinet->inet_saddr = ireq->ir_loc_addr;
1777 inet_opt = rcu_dereference(ireq->ireq_opt);
1778 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1779 newinet->mc_index = inet_iif(skb);
1780 newinet->mc_ttl = ip_hdr(skb)->ttl;
1781 newinet->rcv_tos = ip_hdr(skb)->tos;
1782 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1783 if (inet_opt)
1784 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1785 atomic_set(&newinet->inet_id, get_random_u16());
1786
1787 /* Set ToS of the new socket based upon the value of incoming SYN.
1788 * ECT bits are set later in tcp_init_transfer().
1789 */
1790 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1791 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1792
1793 if (!dst) {
1794 dst = inet_csk_route_child_sock(sk, newsk, req);
1795 if (!dst)
1796 goto put_and_exit;
1797 } else {
1798 /* syncookie case : see end of cookie_v4_check() */
1799 }
1800 sk_setup_caps(newsk, dst);
1801
1802 tcp_ca_openreq_child(newsk, dst);
1803
1804 tcp_sync_mss(newsk, dst_mtu(dst));
1805 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1806
1807 tcp_initialize_rcv_mss(newsk);
1808
1809 #ifdef CONFIG_TCP_MD5SIG
1810 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1811 /* Copy over the MD5 key from the original socket */
1812 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1813 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1814 if (key && !tcp_rsk_used_ao(req)) {
1815 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1816 goto put_and_exit;
1817 sk_gso_disable(newsk);
1818 }
1819 #endif
1820 #ifdef CONFIG_TCP_AO
1821 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1822 goto put_and_exit; /* OOM, release back memory */
1823 #endif
1824
1825 if (__inet_inherit_port(sk, newsk) < 0)
1826 goto put_and_exit;
1827 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1828 &found_dup_sk);
1829 if (likely(*own_req)) {
1830 tcp_move_syn(newtp, req);
1831 ireq->ireq_opt = NULL;
1832 } else {
1833 newinet->inet_opt = NULL;
1834
1835 if (!req_unhash && found_dup_sk) {
1836 /* This code path should only be executed in the
1837 * syncookie case
1838 */
1839 bh_unlock_sock(newsk);
1840 sock_put(newsk);
1841 newsk = NULL;
1842 }
1843 }
1844 return newsk;
1845
1846 exit_overflow:
1847 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1848 exit_nonewsk:
1849 dst_release(dst);
1850 exit:
1851 tcp_listendrop(sk);
1852 return NULL;
1853 put_and_exit:
1854 newinet->inet_opt = NULL;
1855 inet_csk_prepare_forced_close(newsk);
1856 tcp_done(newsk);
1857 goto exit;
1858 }
1859 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1860
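/* A non-SYN segment reaching a listener may complete a syncookie
 * handshake: when CONFIG_SYN_COOKIES is enabled, cookie_v4_check()
 * validates the cookie and can return a newly created child socket.
 */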
1861 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1862 {
1863 #ifdef CONFIG_SYN_COOKIES
1864 const struct tcphdr *th = tcp_hdr(skb);
1865
1866 if (!th->syn)
1867 sk = cookie_v4_check(sk, skb);
1868 #endif
1869 return sk;
1870 }
1871
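/* Compute a syncookie ISN for a SYN without creating any state.
 * Returns the MSS to encode, or 0 when no cookie should be generated.
 */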
1872 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1873 struct tcphdr *th, u32 *cookie)
1874 {
1875 u16 mss = 0;
1876 #ifdef CONFIG_SYN_COOKIES
1877 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1878 &tcp_request_sock_ipv4_ops, sk, th);
1879 if (mss) {
1880 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1881 tcp_synq_overflow(sk);
1882 }
1883 #endif
1884 return mss;
1885 }
1886
1887 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1888 u32));
1889 /* The socket must have its spinlock held when we get
1890 * here, unless it is a TCP_LISTEN socket.
1891 *
1892 * We have a potential double-lock case here, so even when
1893 * doing backlog processing we use the BH locking scheme.
1894 * This is because we cannot sleep with the original spinlock
1895 * held.
1896 */
1897 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1898 {
1899 enum skb_drop_reason reason;
1900 struct sock *rsk;
1901
1902 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1903 struct dst_entry *dst;
1904
1905 dst = rcu_dereference_protected(sk->sk_rx_dst,
1906 lockdep_sock_is_held(sk));
1907
1908 sock_rps_save_rxhash(sk, skb);
1909 sk_mark_napi_id(sk, skb);
1910 if (dst) {
1911 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1912 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1913 dst, 0)) {
1914 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1915 dst_release(dst);
1916 }
1917 }
1918 tcp_rcv_established(sk, skb);
1919 return 0;
1920 }
1921
1922 if (tcp_checksum_complete(skb))
1923 goto csum_err;
1924
1925 if (sk->sk_state == TCP_LISTEN) {
1926 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1927
1928 if (!nsk)
1929 return 0;
1930 if (nsk != sk) {
1931 reason = tcp_child_process(sk, nsk, skb);
1932 if (reason) {
1933 rsk = nsk;
1934 goto reset;
1935 }
1936 return 0;
1937 }
1938 } else
1939 sock_rps_save_rxhash(sk, skb);
1940
1941 reason = tcp_rcv_state_process(sk, skb);
1942 if (reason) {
1943 rsk = sk;
1944 goto reset;
1945 }
1946 return 0;
1947
1948 reset:
1949 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1950 discard:
1951 sk_skb_reason_drop(sk, skb, reason);
1952 /* Be careful here. If this function gets more complicated and
1953 * gcc suffers from register pressure on the x86, sk (in %ebx)
1954 * might be destroyed here. This current version compiles correctly,
1955 * but you have been warned.
1956 */
1957 return 0;
1958
1959 csum_err:
1960 reason = SKB_DROP_REASON_TCP_CSUM;
1961 trace_tcp_bad_csum(skb);
1962 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1963 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1964 goto discard;
1965 }
1966 EXPORT_SYMBOL(tcp_v4_do_rcv);
1967
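/* Early demux: look up an established socket before the routing decision.
 * On a hit, attach the socket to the skb and, if its cached input route
 * (sk_rx_dst) is still valid for this interface, reuse it and skip the
 * route lookup.
 */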
1968 int tcp_v4_early_demux(struct sk_buff *skb)
1969 {
1970 struct net *net = dev_net(skb->dev);
1971 const struct iphdr *iph;
1972 const struct tcphdr *th;
1973 struct sock *sk;
1974
1975 if (skb->pkt_type != PACKET_HOST)
1976 return 0;
1977
1978 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1979 return 0;
1980
1981 iph = ip_hdr(skb);
1982 th = tcp_hdr(skb);
1983
1984 if (th->doff < sizeof(struct tcphdr) / 4)
1985 return 0;
1986
1987 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1988 iph->saddr, th->source,
1989 iph->daddr, ntohs(th->dest),
1990 skb->skb_iif, inet_sdif(skb));
1991 if (sk) {
1992 skb->sk = sk;
1993 skb->destructor = sock_edemux;
1994 if (sk_fullsock(sk)) {
1995 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1996
1997 if (dst)
1998 dst = dst_check(dst, 0);
1999 if (dst &&
2000 sk->sk_rx_dst_ifindex == skb->skb_iif)
2001 skb_dst_set_noref(skb, dst);
2002 }
2003 }
2004 return 0;
2005 }
2006
2007 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2008 enum skb_drop_reason *reason)
2009 {
2010 u32 tail_gso_size, tail_gso_segs;
2011 struct skb_shared_info *shinfo;
2012 const struct tcphdr *th;
2013 struct tcphdr *thtail;
2014 struct sk_buff *tail;
2015 unsigned int hdrlen;
2016 bool fragstolen;
2017 u32 gso_segs;
2018 u32 gso_size;
2019 u64 limit;
2020 int delta;
2021
2022 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2023 * we can fix skb->truesize to its real value to avoid future drops.
2024 * This is valid because skb is not yet charged to the socket.
2025 * It has been noticed that pure SACK packets were sometimes dropped
2026 * (if cooked by drivers without the copybreak feature).
2027 */
2028 skb_condense(skb);
2029
2030 tcp_cleanup_skb(skb);
2031
2032 if (unlikely(tcp_checksum_complete(skb))) {
2033 bh_unlock_sock(sk);
2034 trace_tcp_bad_csum(skb);
2035 *reason = SKB_DROP_REASON_TCP_CSUM;
2036 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2037 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2038 return true;
2039 }
2040
2041 /* Attempt coalescing to last skb in backlog, even if we are
2042 * above the limits.
2043 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2044 */
2045 th = (const struct tcphdr *)skb->data;
2046 hdrlen = th->doff * 4;
2047
2048 tail = sk->sk_backlog.tail;
2049 if (!tail)
2050 goto no_coalesce;
2051 thtail = (struct tcphdr *)tail->data;
2052
2053 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2054 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2055 ((TCP_SKB_CB(tail)->tcp_flags |
2056 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2057 !((TCP_SKB_CB(tail)->tcp_flags &
2058 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2059 ((TCP_SKB_CB(tail)->tcp_flags ^
2060 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2061 !tcp_skb_can_collapse_rx(tail, skb) ||
2062 thtail->doff != th->doff ||
2063 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2064 goto no_coalesce;
2065
2066 __skb_pull(skb, hdrlen);
2067
2068 shinfo = skb_shinfo(skb);
2069 gso_size = shinfo->gso_size ?: skb->len;
2070 gso_segs = shinfo->gso_segs ?: 1;
2071
2072 shinfo = skb_shinfo(tail);
2073 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2074 tail_gso_segs = shinfo->gso_segs ?: 1;
2075
2076 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2077 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2078
2079 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2080 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2081 thtail->window = th->window;
2082 }
2083
2084 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2085 * thtail->fin, so that the fast path in tcp_rcv_established()
2086 * is not entered if we append a packet with a FIN.
2087 * SYN, RST, URG are not present.
2088 * ACK is set on both packets.
2089 * PSH : we do not really care about it in the TCP stack,
2090 * at least for 'GRO' packets.
2091 */
2092 thtail->fin |= th->fin;
2093 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2094
2095 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2096 TCP_SKB_CB(tail)->has_rxtstamp = true;
2097 tail->tstamp = skb->tstamp;
2098 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2099 }
2100
2101 /* Not as strict as GRO. We only need to carry the max mss value */
2102 shinfo->gso_size = max(gso_size, tail_gso_size);
2103 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2104
2105 sk->sk_backlog.len += delta;
2106 __NET_INC_STATS(sock_net(sk),
2107 LINUX_MIB_TCPBACKLOGCOALESCE);
2108 kfree_skb_partial(skb, fragstolen);
2109 return false;
2110 }
2111 __skb_push(skb, hdrlen);
2112
2113 no_coalesce:
2114 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2115 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2116 * sk_rcvbuf in normal conditions.
2117 */
2118 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2119
2120 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2121
2122 /* Only the socket owner can try to collapse/prune rx queues
2123 * to reduce memory overhead, so add a little headroom here.
2124 * Only a few socket backlogs are likely to be non-empty concurrently.
2125 */
2126 limit += 64 * 1024;
2127
2128 limit = min_t(u64, limit, UINT_MAX);
2129
2130 if (unlikely(sk_add_backlog(sk, skb, limit))) {
2131 bh_unlock_sock(sk);
2132 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2133 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2134 return true;
2135 }
2136 return false;
2137 }
2138 EXPORT_SYMBOL(tcp_add_backlog);
2139
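/* Run the socket filter (if any) attached to @sk on @skb.
 * sk_filter_trim_cap() may trim the packet, but never below the
 * TCP header length.
 */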
2140 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2141 {
2142 struct tcphdr *th = (struct tcphdr *)skb->data;
2143
2144 return sk_filter_trim_cap(sk, skb, th->doff * 4);
2145 }
2146 EXPORT_SYMBOL(tcp_filter);
2147
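/* Undo tcp_v4_fill_cb(): move the IP control block back to its original
 * location at the start of skb->cb[].
 */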
2148 static void tcp_v4_restore_cb(struct sk_buff *skb)
2149 {
2150 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2151 sizeof(struct inet_skb_parm));
2152 }
2153
2154 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2155 const struct tcphdr *th)
2156 {
2157 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2158 * barrier() makes sure the compiler won't play fool^Waliasing games.
2159 */
2160 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2161 sizeof(struct inet_skb_parm));
2162 barrier();
2163
2164 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2165 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2166 skb->len - th->doff * 4);
2167 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2168 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2169 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2170 TCP_SKB_CB(skb)->sacked = 0;
2171 TCP_SKB_CB(skb)->has_rxtstamp =
2172 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2173 }
2174
2175 /*
2176 * From tcp_input.c
2177 */
2178
2179 int tcp_v4_rcv(struct sk_buff *skb)
2180 {
2181 struct net *net = dev_net(skb->dev);
2182 enum skb_drop_reason drop_reason;
2183 int sdif = inet_sdif(skb);
2184 int dif = inet_iif(skb);
2185 const struct iphdr *iph;
2186 const struct tcphdr *th;
2187 struct sock *sk = NULL;
2188 bool refcounted;
2189 int ret;
2190 u32 isn;
2191
2192 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2193 if (skb->pkt_type != PACKET_HOST)
2194 goto discard_it;
2195
2196 /* Count it even if it's bad */
2197 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2198
2199 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2200 goto discard_it;
2201
2202 th = (const struct tcphdr *)skb->data;
2203
2204 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2205 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2206 goto bad_packet;
2207 }
2208 if (!pskb_may_pull(skb, th->doff * 4))
2209 goto discard_it;
2210
2211 /* An explanation is required here, I think.
2212 * Packet length and doff are validated by header prediction,
2213 * provided the case of th->doff==0 is eliminated.
2214 * So, we defer the checks. */
2215
2216 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2217 goto csum_error;
2218
2219 th = (const struct tcphdr *)skb->data;
2220 iph = ip_hdr(skb);
2221 lookup:
2222 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2223 skb, __tcp_hdrlen(th), th->source,
2224 th->dest, sdif, &refcounted);
2225 if (!sk)
2226 goto no_tcp_socket;
2227
2228 if (sk->sk_state == TCP_TIME_WAIT)
2229 goto do_time_wait;
2230
2231 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2232 struct request_sock *req = inet_reqsk(sk);
2233 bool req_stolen = false;
2234 struct sock *nsk;
2235
2236 sk = req->rsk_listener;
2237 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2238 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2239 else
2240 drop_reason = tcp_inbound_hash(sk, req, skb,
2241 &iph->saddr, &iph->daddr,
2242 AF_INET, dif, sdif);
2243 if (unlikely(drop_reason)) {
2244 sk_drops_add(sk, skb);
2245 reqsk_put(req);
2246 goto discard_it;
2247 }
2248 if (tcp_checksum_complete(skb)) {
2249 reqsk_put(req);
2250 goto csum_error;
2251 }
2252 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2253 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2254 if (!nsk) {
2255 inet_csk_reqsk_queue_drop_and_put(sk, req);
2256 goto lookup;
2257 }
2258 sk = nsk;
2259 /* reuseport_migrate_sock() has already held one sk_refcnt
2260 * before returning.
2261 */
2262 } else {
2263 /* We own a reference on the listener, increase it again
2264 * as we might lose it too soon.
2265 */
2266 sock_hold(sk);
2267 }
2268 refcounted = true;
2269 nsk = NULL;
2270 if (!tcp_filter(sk, skb)) {
2271 th = (const struct tcphdr *)skb->data;
2272 iph = ip_hdr(skb);
2273 tcp_v4_fill_cb(skb, iph, th);
2274 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2275 } else {
2276 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2277 }
2278 if (!nsk) {
2279 reqsk_put(req);
2280 if (req_stolen) {
2281 /* Another cpu got exclusive access to req
2282 * and created a full blown socket.
2283 * Try to feed this packet to this socket
2284 * instead of discarding it.
2285 */
2286 tcp_v4_restore_cb(skb);
2287 sock_put(sk);
2288 goto lookup;
2289 }
2290 goto discard_and_relse;
2291 }
2292 nf_reset_ct(skb);
2293 if (nsk == sk) {
2294 reqsk_put(req);
2295 tcp_v4_restore_cb(skb);
2296 } else {
2297 drop_reason = tcp_child_process(sk, nsk, skb);
2298 if (drop_reason) {
2299 enum sk_rst_reason rst_reason;
2300
2301 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2302 tcp_v4_send_reset(nsk, skb, rst_reason);
2303 goto discard_and_relse;
2304 }
2305 sock_put(sk);
2306 return 0;
2307 }
2308 }
2309
2310 process:
2311 if (static_branch_unlikely(&ip4_min_ttl)) {
2312 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2313 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2314 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2315 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2316 goto discard_and_relse;
2317 }
2318 }
2319
2320 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2321 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2322 goto discard_and_relse;
2323 }
2324
2325 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2326 AF_INET, dif, sdif);
2327 if (drop_reason)
2328 goto discard_and_relse;
2329
2330 nf_reset_ct(skb);
2331
2332 if (tcp_filter(sk, skb)) {
2333 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2334 goto discard_and_relse;
2335 }
2336 th = (const struct tcphdr *)skb->data;
2337 iph = ip_hdr(skb);
2338 tcp_v4_fill_cb(skb, iph, th);
2339
2340 skb->dev = NULL;
2341
2342 if (sk->sk_state == TCP_LISTEN) {
2343 ret = tcp_v4_do_rcv(sk, skb);
2344 goto put_and_return;
2345 }
2346
2347 sk_incoming_cpu_update(sk);
2348
2349 bh_lock_sock_nested(sk);
2350 tcp_segs_in(tcp_sk(sk), skb);
2351 ret = 0;
2352 if (!sock_owned_by_user(sk)) {
2353 ret = tcp_v4_do_rcv(sk, skb);
2354 } else {
2355 if (tcp_add_backlog(sk, skb, &drop_reason))
2356 goto discard_and_relse;
2357 }
2358 bh_unlock_sock(sk);
2359
2360 put_and_return:
2361 if (refcounted)
2362 sock_put(sk);
2363
2364 return ret;
2365
2366 no_tcp_socket:
2367 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2368 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2369 goto discard_it;
2370
2371 tcp_v4_fill_cb(skb, iph, th);
2372
2373 if (tcp_checksum_complete(skb)) {
2374 csum_error:
2375 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2376 trace_tcp_bad_csum(skb);
2377 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2378 bad_packet:
2379 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2380 } else {
2381 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2382 }
2383
2384 discard_it:
2385 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2386 /* Discard frame. */
2387 sk_skb_reason_drop(sk, skb, drop_reason);
2388 return 0;
2389
2390 discard_and_relse:
2391 sk_drops_add(sk, skb);
2392 if (refcounted)
2393 sock_put(sk);
2394 goto discard_it;
2395
2396 do_time_wait:
2397 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2398 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2399 inet_twsk_put(inet_twsk(sk));
2400 goto discard_it;
2401 }
2402
2403 tcp_v4_fill_cb(skb, iph, th);
2404
2405 if (tcp_checksum_complete(skb)) {
2406 inet_twsk_put(inet_twsk(sk));
2407 goto csum_error;
2408 }
2409 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2410 case TCP_TW_SYN: {
2411 struct sock *sk2 = inet_lookup_listener(net,
2412 net->ipv4.tcp_death_row.hashinfo,
2413 skb, __tcp_hdrlen(th),
2414 iph->saddr, th->source,
2415 iph->daddr, th->dest,
2416 inet_iif(skb),
2417 sdif);
2418 if (sk2) {
2419 inet_twsk_deschedule_put(inet_twsk(sk));
2420 sk = sk2;
2421 tcp_v4_restore_cb(skb);
2422 refcounted = false;
2423 __this_cpu_write(tcp_tw_isn, isn);
2424 goto process;
2425 }
2426 }
2427 /* to ACK */
2428 fallthrough;
2429 case TCP_TW_ACK:
2430 tcp_v4_timewait_ack(sk, skb);
2431 break;
2432 case TCP_TW_RST:
2433 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2434 inet_twsk_deschedule_put(inet_twsk(sk));
2435 goto discard_it;
2436 case TCP_TW_SUCCESS:;
2437 }
2438 goto discard_it;
2439 }
2440
2441 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2442 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2443 .twsk_destructor= tcp_twsk_destructor,
2444 };
2445
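/* Cache the input route of @skb on @sk so that tcp_v4_early_demux() and
 * the established fast path can avoid a route lookup for later packets.
 */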
2446 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2447 {
2448 struct dst_entry *dst = skb_dst(skb);
2449
2450 if (dst && dst_hold_safe(dst)) {
2451 rcu_assign_pointer(sk->sk_rx_dst, dst);
2452 sk->sk_rx_dst_ifindex = skb->skb_iif;
2453 }
2454 }
2455 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2456
2457 const struct inet_connection_sock_af_ops ipv4_specific = {
2458 .queue_xmit = ip_queue_xmit,
2459 .send_check = tcp_v4_send_check,
2460 .rebuild_header = inet_sk_rebuild_header,
2461 .sk_rx_dst_set = inet_sk_rx_dst_set,
2462 .conn_request = tcp_v4_conn_request,
2463 .syn_recv_sock = tcp_v4_syn_recv_sock,
2464 .net_header_len = sizeof(struct iphdr),
2465 .setsockopt = ip_setsockopt,
2466 .getsockopt = ip_getsockopt,
2467 .addr2sockaddr = inet_csk_addr2sockaddr,
2468 .sockaddr_len = sizeof(struct sockaddr_in),
2469 .mtu_reduced = tcp_v4_mtu_reduced,
2470 };
2471 EXPORT_SYMBOL(ipv4_specific);
2472
2473 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2474 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2475 #ifdef CONFIG_TCP_MD5SIG
2476 .md5_lookup = tcp_v4_md5_lookup,
2477 .calc_md5_hash = tcp_v4_md5_hash_skb,
2478 .md5_parse = tcp_v4_parse_md5_keys,
2479 #endif
2480 #ifdef CONFIG_TCP_AO
2481 .ao_lookup = tcp_v4_ao_lookup,
2482 .calc_ao_hash = tcp_v4_ao_hash_skb,
2483 .ao_parse = tcp_v4_parse_ao,
2484 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2485 #endif
2486 };
2487 #endif
2488
2489 /* NOTE: A lot of things are set to zero explicitly by the call to
2490 * sk_alloc(), so they need not be done here.
2491 */
2492 static int tcp_v4_init_sock(struct sock *sk)
2493 {
2494 struct inet_connection_sock *icsk = inet_csk(sk);
2495
2496 tcp_init_sock(sk);
2497
2498 icsk->icsk_af_ops = &ipv4_specific;
2499
2500 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2501 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2502 #endif
2503
2504 return 0;
2505 }
2506
2507 #ifdef CONFIG_TCP_MD5SIG
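/* RCU callback freeing the md5sig_info container. Also drops the
 * tcp_md5_needed static key and the MD5 sigpool reference held for
 * this socket.
 */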
2508 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2509 {
2510 struct tcp_md5sig_info *md5sig;
2511
2512 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2513 kfree(md5sig);
2514 static_branch_slow_dec_deferred(&tcp_md5_needed);
2515 tcp_md5_release_sigpool();
2516 }
2517 #endif
2518
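/* Release the page_pool references still tracked in sk->sk_user_frags
 * (receive frags that were exposed to user space, e.g. by devmem TCP).
 */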
2519 static void tcp_release_user_frags(struct sock *sk)
2520 {
2521 #ifdef CONFIG_PAGE_POOL
2522 unsigned long index;
2523 void *netmem;
2524
2525 xa_for_each(&sk->sk_user_frags, index, netmem)
2526 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2527 #endif
2528 }
2529
2530 void tcp_v4_destroy_sock(struct sock *sk)
2531 {
2532 struct tcp_sock *tp = tcp_sk(sk);
2533
2534 tcp_release_user_frags(sk);
2535
2536 xa_destroy(&sk->sk_user_frags);
2537
2538 trace_tcp_destroy_sock(sk);
2539
2540 tcp_clear_xmit_timers(sk);
2541
2542 tcp_cleanup_congestion_control(sk);
2543
2544 tcp_cleanup_ulp(sk);
2545
2546 /* Clean up the write buffer. */
2547 tcp_write_queue_purge(sk);
2548
2549 /* Check if we want to disable active TFO */
2550 tcp_fastopen_active_disable_ofo_check(sk);
2551
2552 /* Cleans up our, hopefully empty, out_of_order_queue. */
2553 skb_rbtree_purge(&tp->out_of_order_queue);
2554
2555 #ifdef CONFIG_TCP_MD5SIG
2556 /* Clean up the MD5 key list, if any */
2557 if (tp->md5sig_info) {
2558 struct tcp_md5sig_info *md5sig;
2559
2560 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2561 tcp_clear_md5_list(sk);
2562 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2563 rcu_assign_pointer(tp->md5sig_info, NULL);
2564 }
2565 #endif
2566 tcp_ao_destroy_sock(sk, false);
2567
2568 /* Clean up a referenced TCP bind bucket. */
2569 if (inet_csk(sk)->icsk_bind_hash)
2570 inet_put_port(sk);
2571
2572 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2573
2574 /* If socket is aborted during connect operation */
2575 tcp_free_fastopen_req(tp);
2576 tcp_fastopen_destroy_cipher(sk);
2577 tcp_saved_syn_free(tp);
2578
2579 sk_sockets_allocated_dec(sk);
2580 }
2581 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2582
2583 #ifdef CONFIG_PROC_FS
2584 /* Proc filesystem TCP sock list dumping. */
2585
2586 static unsigned short seq_file_family(const struct seq_file *seq);
2587
2588 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2589 {
2590 unsigned short family = seq_file_family(seq);
2591
2592 /* AF_UNSPEC is used as a match-all */
2593 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2594 net_eq(sock_net(sk), seq_file_net(seq)));
2595 }
2596
2597 /* Find a non-empty bucket (starting from st->bucket)
2598 * and return the first sk from it.
2599 */
2600 static void *listening_get_first(struct seq_file *seq)
2601 {
2602 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2603 struct tcp_iter_state *st = seq->private;
2604
2605 st->offset = 0;
2606 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2607 struct inet_listen_hashbucket *ilb2;
2608 struct hlist_nulls_node *node;
2609 struct sock *sk;
2610
2611 ilb2 = &hinfo->lhash2[st->bucket];
2612 if (hlist_nulls_empty(&ilb2->nulls_head))
2613 continue;
2614
2615 spin_lock(&ilb2->lock);
2616 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2617 if (seq_sk_match(seq, sk))
2618 return sk;
2619 }
2620 spin_unlock(&ilb2->lock);
2621 }
2622
2623 return NULL;
2624 }
2625
2626 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2627 * If "cur" is the last one in st->bucket,
2628 * call listening_get_first() to return the first sk of the next
2629 * non-empty bucket.
2630 */
2631 static void *listening_get_next(struct seq_file *seq, void *cur)
2632 {
2633 struct tcp_iter_state *st = seq->private;
2634 struct inet_listen_hashbucket *ilb2;
2635 struct hlist_nulls_node *node;
2636 struct inet_hashinfo *hinfo;
2637 struct sock *sk = cur;
2638
2639 ++st->num;
2640 ++st->offset;
2641
2642 sk = sk_nulls_next(sk);
2643 sk_nulls_for_each_from(sk, node) {
2644 if (seq_sk_match(seq, sk))
2645 return sk;
2646 }
2647
2648 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2649 ilb2 = &hinfo->lhash2[st->bucket];
2650 spin_unlock(&ilb2->lock);
2651 ++st->bucket;
2652 return listening_get_first(seq);
2653 }
2654
2655 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2656 {
2657 struct tcp_iter_state *st = seq->private;
2658 void *rc;
2659
2660 st->bucket = 0;
2661 st->offset = 0;
2662 rc = listening_get_first(seq);
2663
2664 while (rc && *pos) {
2665 rc = listening_get_next(seq, rc);
2666 --*pos;
2667 }
2668 return rc;
2669 }
2670
2671 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2672 const struct tcp_iter_state *st)
2673 {
2674 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2675 }
2676
2677 /*
2678 * Get first established socket starting from bucket given in st->bucket.
2679 * If st->bucket is zero, the very first socket in the hash is returned.
2680 */
2681 static void *established_get_first(struct seq_file *seq)
2682 {
2683 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2684 struct tcp_iter_state *st = seq->private;
2685
2686 st->offset = 0;
2687 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2688 struct sock *sk;
2689 struct hlist_nulls_node *node;
2690 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2691
2692 cond_resched();
2693
2694 /* Lockless fast path for the common case of empty buckets */
2695 if (empty_bucket(hinfo, st))
2696 continue;
2697
2698 spin_lock_bh(lock);
2699 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2700 if (seq_sk_match(seq, sk))
2701 return sk;
2702 }
2703 spin_unlock_bh(lock);
2704 }
2705
2706 return NULL;
2707 }
2708
2709 static void *established_get_next(struct seq_file *seq, void *cur)
2710 {
2711 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2712 struct tcp_iter_state *st = seq->private;
2713 struct hlist_nulls_node *node;
2714 struct sock *sk = cur;
2715
2716 ++st->num;
2717 ++st->offset;
2718
2719 sk = sk_nulls_next(sk);
2720
2721 sk_nulls_for_each_from(sk, node) {
2722 if (seq_sk_match(seq, sk))
2723 return sk;
2724 }
2725
2726 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2727 ++st->bucket;
2728 return established_get_first(seq);
2729 }
2730
2731 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2732 {
2733 struct tcp_iter_state *st = seq->private;
2734 void *rc;
2735
2736 st->bucket = 0;
2737 rc = established_get_first(seq);
2738
2739 while (rc && pos) {
2740 rc = established_get_next(seq, rc);
2741 --pos;
2742 }
2743 return rc;
2744 }
2745
2746 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2747 {
2748 void *rc;
2749 struct tcp_iter_state *st = seq->private;
2750
2751 st->state = TCP_SEQ_STATE_LISTENING;
2752 rc = listening_get_idx(seq, &pos);
2753
2754 if (!rc) {
2755 st->state = TCP_SEQ_STATE_ESTABLISHED;
2756 rc = established_get_idx(seq, pos);
2757 }
2758
2759 return rc;
2760 }
2761
2762 static void *tcp_seek_last_pos(struct seq_file *seq)
2763 {
2764 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2765 struct tcp_iter_state *st = seq->private;
2766 int bucket = st->bucket;
2767 int offset = st->offset;
2768 int orig_num = st->num;
2769 void *rc = NULL;
2770
2771 switch (st->state) {
2772 case TCP_SEQ_STATE_LISTENING:
2773 if (st->bucket > hinfo->lhash2_mask)
2774 break;
2775 rc = listening_get_first(seq);
2776 while (offset-- && rc && bucket == st->bucket)
2777 rc = listening_get_next(seq, rc);
2778 if (rc)
2779 break;
2780 st->bucket = 0;
2781 st->state = TCP_SEQ_STATE_ESTABLISHED;
2782 fallthrough;
2783 case TCP_SEQ_STATE_ESTABLISHED:
2784 if (st->bucket > hinfo->ehash_mask)
2785 break;
2786 rc = established_get_first(seq);
2787 while (offset-- && rc && bucket == st->bucket)
2788 rc = established_get_next(seq, rc);
2789 }
2790
2791 st->num = orig_num;
2792
2793 return rc;
2794 }
2795
2796 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2797 {
2798 struct tcp_iter_state *st = seq->private;
2799 void *rc;
2800
2801 if (*pos && *pos == st->last_pos) {
2802 rc = tcp_seek_last_pos(seq);
2803 if (rc)
2804 goto out;
2805 }
2806
2807 st->state = TCP_SEQ_STATE_LISTENING;
2808 st->num = 0;
2809 st->bucket = 0;
2810 st->offset = 0;
2811 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2812
2813 out:
2814 st->last_pos = *pos;
2815 return rc;
2816 }
2817 EXPORT_SYMBOL(tcp_seq_start);
2818
2819 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2820 {
2821 struct tcp_iter_state *st = seq->private;
2822 void *rc = NULL;
2823
2824 if (v == SEQ_START_TOKEN) {
2825 rc = tcp_get_idx(seq, 0);
2826 goto out;
2827 }
2828
2829 switch (st->state) {
2830 case TCP_SEQ_STATE_LISTENING:
2831 rc = listening_get_next(seq, v);
2832 if (!rc) {
2833 st->state = TCP_SEQ_STATE_ESTABLISHED;
2834 st->bucket = 0;
2835 st->offset = 0;
2836 rc = established_get_first(seq);
2837 }
2838 break;
2839 case TCP_SEQ_STATE_ESTABLISHED:
2840 rc = established_get_next(seq, v);
2841 break;
2842 }
2843 out:
2844 ++*pos;
2845 st->last_pos = *pos;
2846 return rc;
2847 }
2848 EXPORT_SYMBOL(tcp_seq_next);
2849
2850 void tcp_seq_stop(struct seq_file *seq, void *v)
2851 {
2852 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2853 struct tcp_iter_state *st = seq->private;
2854
2855 switch (st->state) {
2856 case TCP_SEQ_STATE_LISTENING:
2857 if (v != SEQ_START_TOKEN)
2858 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2859 break;
2860 case TCP_SEQ_STATE_ESTABLISHED:
2861 if (v)
2862 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2863 break;
2864 }
2865 }
2866 EXPORT_SYMBOL(tcp_seq_stop);
2867
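/* Format one request socket as a /proc/net/tcp line. */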
2868 static void get_openreq4(const struct request_sock *req,
2869 struct seq_file *f, int i)
2870 {
2871 const struct inet_request_sock *ireq = inet_rsk(req);
2872 long delta = req->rsk_timer.expires - jiffies;
2873
2874 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2875 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2876 i,
2877 ireq->ir_loc_addr,
2878 ireq->ir_num,
2879 ireq->ir_rmt_addr,
2880 ntohs(ireq->ir_rmt_port),
2881 TCP_SYN_RECV,
2882 0, 0, /* could print option size, but that is af dependent. */
2883 1, /* timers active (only the expire timer) */
2884 jiffies_delta_to_clock_t(delta),
2885 req->num_timeout,
2886 from_kuid_munged(seq_user_ns(f),
2887 sock_i_uid(req->rsk_listener)),
2888 0, /* non standard timer */
2889 0, /* open_requests have no inode */
2890 0,
2891 req);
2892 }
2893
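/* Format one full (non-request, non-timewait) socket as a /proc/net/tcp line. */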
2894 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2895 {
2896 int timer_active;
2897 unsigned long timer_expires;
2898 const struct tcp_sock *tp = tcp_sk(sk);
2899 const struct inet_connection_sock *icsk = inet_csk(sk);
2900 const struct inet_sock *inet = inet_sk(sk);
2901 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2902 __be32 dest = inet->inet_daddr;
2903 __be32 src = inet->inet_rcv_saddr;
2904 __u16 destp = ntohs(inet->inet_dport);
2905 __u16 srcp = ntohs(inet->inet_sport);
2906 u8 icsk_pending;
2907 int rx_queue;
2908 int state;
2909
2910 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2911 if (icsk_pending == ICSK_TIME_RETRANS ||
2912 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2913 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2914 timer_active = 1;
2915 timer_expires = icsk->icsk_timeout;
2916 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2917 timer_active = 4;
2918 timer_expires = icsk->icsk_timeout;
2919 } else if (timer_pending(&sk->sk_timer)) {
2920 timer_active = 2;
2921 timer_expires = sk->sk_timer.expires;
2922 } else {
2923 timer_active = 0;
2924 timer_expires = jiffies;
2925 }
2926
2927 state = inet_sk_state_load(sk);
2928 if (state == TCP_LISTEN)
2929 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2930 else
2931 /* Because we don't lock the socket,
2932 * we might find a transient negative value.
2933 */
2934 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2935 READ_ONCE(tp->copied_seq), 0);
2936
2937 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2938 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2939 i, src, srcp, dest, destp, state,
2940 READ_ONCE(tp->write_seq) - tp->snd_una,
2941 rx_queue,
2942 timer_active,
2943 jiffies_delta_to_clock_t(timer_expires - jiffies),
2944 icsk->icsk_retransmits,
2945 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2946 icsk->icsk_probes_out,
2947 sock_i_ino(sk),
2948 refcount_read(&sk->sk_refcnt), sk,
2949 jiffies_to_clock_t(icsk->icsk_rto),
2950 jiffies_to_clock_t(icsk->icsk_ack.ato),
2951 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2952 tcp_snd_cwnd(tp),
2953 state == TCP_LISTEN ?
2954 fastopenq->max_qlen :
2955 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2956 }
2957
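/* Format one TIME_WAIT socket as a /proc/net/tcp line. */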
2958 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2959 struct seq_file *f, int i)
2960 {
2961 long delta = tw->tw_timer.expires - jiffies;
2962 __be32 dest, src;
2963 __u16 destp, srcp;
2964
2965 dest = tw->tw_daddr;
2966 src = tw->tw_rcv_saddr;
2967 destp = ntohs(tw->tw_dport);
2968 srcp = ntohs(tw->tw_sport);
2969
2970 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2971 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2972 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2973 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2974 refcount_read(&tw->tw_refcnt), tw);
2975 }
2976
2977 #define TMPSZ 150
2978
2979 static int tcp4_seq_show(struct seq_file *seq, void *v)
2980 {
2981 struct tcp_iter_state *st;
2982 struct sock *sk = v;
2983
2984 seq_setwidth(seq, TMPSZ - 1);
2985 if (v == SEQ_START_TOKEN) {
2986 seq_puts(seq, " sl local_address rem_address st tx_queue "
2987 "rx_queue tr tm->when retrnsmt uid timeout "
2988 "inode");
2989 goto out;
2990 }
2991 st = seq->private;
2992
2993 if (sk->sk_state == TCP_TIME_WAIT)
2994 get_timewait4_sock(v, seq, st->num);
2995 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2996 get_openreq4(v, seq, st->num);
2997 else
2998 get_tcp4_sock(v, seq, st->num);
2999 out:
3000 seq_pad(seq, '\n');
3001 return 0;
3002 }
3003
3004 #ifdef CONFIG_BPF_SYSCALL
3005 struct bpf_tcp_iter_state {
3006 struct tcp_iter_state state;
3007 unsigned int cur_sk;
3008 unsigned int end_sk;
3009 unsigned int max_sk;
3010 struct sock **batch;
3011 bool st_bucket_done;
3012 };
3013
3014 struct bpf_iter__tcp {
3015 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3016 __bpf_md_ptr(struct sock_common *, sk_common);
3017 uid_t uid __aligned(8);
3018 };
3019
3020 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3021 struct sock_common *sk_common, uid_t uid)
3022 {
3023 struct bpf_iter__tcp ctx;
3024
3025 meta->seq_num--; /* skip SEQ_START_TOKEN */
3026 ctx.meta = meta;
3027 ctx.sk_common = sk_common;
3028 ctx.uid = uid;
3029 return bpf_iter_run_prog(prog, &ctx);
3030 }
3031
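/* Release the references on the sockets of the current batch that have
 * not been consumed yet.
 */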
3032 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3033 {
3034 while (iter->cur_sk < iter->end_sk)
3035 sock_gen_put(iter->batch[iter->cur_sk++]);
3036 }
3037
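/* Resize the batch array to @new_batch_sz entries. The sockets still
 * held in the old batch are released, so the caller must re-batch the
 * bucket afterwards.
 */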
3038 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3039 unsigned int new_batch_sz)
3040 {
3041 struct sock **new_batch;
3042
3043 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3044 GFP_USER | __GFP_NOWARN);
3045 if (!new_batch)
3046 return -ENOMEM;
3047
3048 bpf_iter_tcp_put_batch(iter);
3049 kvfree(iter->batch);
3050 iter->batch = new_batch;
3051 iter->max_sk = new_batch_sz;
3052
3053 return 0;
3054 }
3055
3056 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3057 struct sock *start_sk)
3058 {
3059 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3060 struct bpf_tcp_iter_state *iter = seq->private;
3061 struct tcp_iter_state *st = &iter->state;
3062 struct hlist_nulls_node *node;
3063 unsigned int expected = 1;
3064 struct sock *sk;
3065
3066 sock_hold(start_sk);
3067 iter->batch[iter->end_sk++] = start_sk;
3068
3069 sk = sk_nulls_next(start_sk);
3070 sk_nulls_for_each_from(sk, node) {
3071 if (seq_sk_match(seq, sk)) {
3072 if (iter->end_sk < iter->max_sk) {
3073 sock_hold(sk);
3074 iter->batch[iter->end_sk++] = sk;
3075 }
3076 expected++;
3077 }
3078 }
3079 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3080
3081 return expected;
3082 }
3083
3084 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3085 struct sock *start_sk)
3086 {
3087 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3088 struct bpf_tcp_iter_state *iter = seq->private;
3089 struct tcp_iter_state *st = &iter->state;
3090 struct hlist_nulls_node *node;
3091 unsigned int expected = 1;
3092 struct sock *sk;
3093
3094 sock_hold(start_sk);
3095 iter->batch[iter->end_sk++] = start_sk;
3096
3097 sk = sk_nulls_next(start_sk);
3098 sk_nulls_for_each_from(sk, node) {
3099 if (seq_sk_match(seq, sk)) {
3100 if (iter->end_sk < iter->max_sk) {
3101 sock_hold(sk);
3102 iter->batch[iter->end_sk++] = sk;
3103 }
3104 expected++;
3105 }
3106 }
3107 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3108
3109 return expected;
3110 }
3111
3112 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3113 {
3114 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3115 struct bpf_tcp_iter_state *iter = seq->private;
3116 struct tcp_iter_state *st = &iter->state;
3117 unsigned int expected;
3118 bool resized = false;
3119 struct sock *sk;
3120
3121 /* The st->bucket is done. Directly advance to the next
3122 * bucket instead of having tcp_seek_last_pos() skip sockets
3123 * one by one in the current bucket, only to find out that
3124 * it has to advance to the next bucket.
3125 */
3126 if (iter->st_bucket_done) {
3127 st->offset = 0;
3128 st->bucket++;
3129 if (st->state == TCP_SEQ_STATE_LISTENING &&
3130 st->bucket > hinfo->lhash2_mask) {
3131 st->state = TCP_SEQ_STATE_ESTABLISHED;
3132 st->bucket = 0;
3133 }
3134 }
3135
3136 again:
3137 /* Get a new batch */
3138 iter->cur_sk = 0;
3139 iter->end_sk = 0;
3140 iter->st_bucket_done = false;
3141
3142 sk = tcp_seek_last_pos(seq);
3143 if (!sk)
3144 return NULL; /* Done */
3145
3146 if (st->state == TCP_SEQ_STATE_LISTENING)
3147 expected = bpf_iter_tcp_listening_batch(seq, sk);
3148 else
3149 expected = bpf_iter_tcp_established_batch(seq, sk);
3150
3151 if (iter->end_sk == expected) {
3152 iter->st_bucket_done = true;
3153 return sk;
3154 }
3155
3156 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3157 resized = true;
3158 goto again;
3159 }
3160
3161 return sk;
3162 }
3163
3164 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3165 {
3166 /* bpf iter does not support lseek, so it always
3167 * continues from where it was stop()-ped.
3168 */
3169 if (*pos)
3170 return bpf_iter_tcp_batch(seq);
3171
3172 return SEQ_START_TOKEN;
3173 }
3174
3175 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3176 {
3177 struct bpf_tcp_iter_state *iter = seq->private;
3178 struct tcp_iter_state *st = &iter->state;
3179 struct sock *sk;
3180
3181 /* Whenever seq_next() is called, the sk at iter->cur_sk is
3182 * done with seq_show(), so advance to the next sk in
3183 * the batch.
3184 */
3185 if (iter->cur_sk < iter->end_sk) {
3186 /* Keep st->num consistent in tcp_iter_state.
3187 * bpf_iter_tcp does not use st->num;
3188 * meta.seq_num is used instead.
3189 */
3190 st->num++;
3191 /* Move st->offset to the next sk in the bucket such that
3192 * the future start() will resume at st->offset in
3193 * st->bucket. See tcp_seek_last_pos().
3194 */
3195 st->offset++;
3196 sock_gen_put(iter->batch[iter->cur_sk++]);
3197 }
3198
3199 if (iter->cur_sk < iter->end_sk)
3200 sk = iter->batch[iter->cur_sk];
3201 else
3202 sk = bpf_iter_tcp_batch(seq);
3203
3204 ++*pos;
3205 /* Keep st->last_pos consistent in tcp_iter_state.
3206 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3207 */
3208 st->last_pos = *pos;
3209 return sk;
3210 }
3211
3212 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3213 {
3214 struct bpf_iter_meta meta;
3215 struct bpf_prog *prog;
3216 struct sock *sk = v;
3217 uid_t uid;
3218 int ret;
3219
3220 if (v == SEQ_START_TOKEN)
3221 return 0;
3222
3223 if (sk_fullsock(sk))
3224 lock_sock(sk);
3225
3226 if (unlikely(sk_unhashed(sk))) {
3227 ret = SEQ_SKIP;
3228 goto unlock;
3229 }
3230
3231 if (sk->sk_state == TCP_TIME_WAIT) {
3232 uid = 0;
3233 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3234 const struct request_sock *req = v;
3235
3236 uid = from_kuid_munged(seq_user_ns(seq),
3237 sock_i_uid(req->rsk_listener));
3238 } else {
3239 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3240 }
3241
3242 meta.seq = seq;
3243 prog = bpf_iter_get_info(&meta, false);
3244 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3245
3246 unlock:
3247 if (sk_fullsock(sk))
3248 release_sock(sk);
3249 return ret;
3250
3251 }
3252
3253 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3254 {
3255 struct bpf_tcp_iter_state *iter = seq->private;
3256 struct bpf_iter_meta meta;
3257 struct bpf_prog *prog;
3258
3259 if (!v) {
3260 meta.seq = seq;
3261 prog = bpf_iter_get_info(&meta, true);
3262 if (prog)
3263 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3264 }
3265
3266 if (iter->cur_sk < iter->end_sk) {
3267 bpf_iter_tcp_put_batch(iter);
3268 iter->st_bucket_done = false;
3269 }
3270 }
3271
3272 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3273 .show = bpf_iter_tcp_seq_show,
3274 .start = bpf_iter_tcp_seq_start,
3275 .next = bpf_iter_tcp_seq_next,
3276 .stop = bpf_iter_tcp_seq_stop,
3277 };
3278 #endif

3279 static unsigned short seq_file_family(const struct seq_file *seq)
3280 {
3281 const struct tcp_seq_afinfo *afinfo;
3282
3283 #ifdef CONFIG_BPF_SYSCALL
3284 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3285 if (seq->op == &bpf_iter_tcp_seq_ops)
3286 return AF_UNSPEC;
3287 #endif
3288
3289 /* Iterated from proc fs */
3290 afinfo = pde_data(file_inode(seq->file));
3291 return afinfo->family;
3292 }
3293
3294 static const struct seq_operations tcp4_seq_ops = {
3295 .show = tcp4_seq_show,
3296 .start = tcp_seq_start,
3297 .next = tcp_seq_next,
3298 .stop = tcp_seq_stop,
3299 };
3300
3301 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3302 .family = AF_INET,
3303 };
3304
3305 static int __net_init tcp4_proc_init_net(struct net *net)
3306 {
3307 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3308 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3309 return -ENOMEM;
3310 return 0;
3311 }
3312
3313 static void __net_exit tcp4_proc_exit_net(struct net *net)
3314 {
3315 remove_proc_entry("tcp", net->proc_net);
3316 }
3317
3318 static struct pernet_operations tcp4_net_ops = {
3319 .init = tcp4_proc_init_net,
3320 .exit = tcp4_proc_exit_net,
3321 };
3322
3323 int __init tcp4_proc_init(void)
3324 {
3325 return register_pernet_subsys(&tcp4_net_ops);
3326 }
3327
3328 void tcp4_proc_exit(void)
3329 {
3330 unregister_pernet_subsys(&tcp4_net_ops);
3331 }
3332 #endif /* CONFIG_PROC_FS */
3333
3334 /* @wake is one when sk_stream_write_space() calls us.
3335 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3336 * This mimics the strategy used in sock_def_write_space().
3337 */
3338 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3339 {
3340 const struct tcp_sock *tp = tcp_sk(sk);
3341 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3342 READ_ONCE(tp->snd_nxt);
3343
3344 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3345 }
3346 EXPORT_SYMBOL(tcp_stream_memory_free);
3347
3348 struct proto tcp_prot = {
3349 .name = "TCP",
3350 .owner = THIS_MODULE,
3351 .close = tcp_close,
3352 .pre_connect = tcp_v4_pre_connect,
3353 .connect = tcp_v4_connect,
3354 .disconnect = tcp_disconnect,
3355 .accept = inet_csk_accept,
3356 .ioctl = tcp_ioctl,
3357 .init = tcp_v4_init_sock,
3358 .destroy = tcp_v4_destroy_sock,
3359 .shutdown = tcp_shutdown,
3360 .setsockopt = tcp_setsockopt,
3361 .getsockopt = tcp_getsockopt,
3362 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3363 .keepalive = tcp_set_keepalive,
3364 .recvmsg = tcp_recvmsg,
3365 .sendmsg = tcp_sendmsg,
3366 .splice_eof = tcp_splice_eof,
3367 .backlog_rcv = tcp_v4_do_rcv,
3368 .release_cb = tcp_release_cb,
3369 .hash = inet_hash,
3370 .unhash = inet_unhash,
3371 .get_port = inet_csk_get_port,
3372 .put_port = inet_put_port,
3373 #ifdef CONFIG_BPF_SYSCALL
3374 .psock_update_sk_prot = tcp_bpf_update_proto,
3375 #endif
3376 .enter_memory_pressure = tcp_enter_memory_pressure,
3377 .leave_memory_pressure = tcp_leave_memory_pressure,
3378 .stream_memory_free = tcp_stream_memory_free,
3379 .sockets_allocated = &tcp_sockets_allocated,
3380 .orphan_count = &tcp_orphan_count,
3381
3382 .memory_allocated = &tcp_memory_allocated,
3383 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3384
3385 .memory_pressure = &tcp_memory_pressure,
3386 .sysctl_mem = sysctl_tcp_mem,
3387 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3388 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3389 .max_header = MAX_TCP_HEADER,
3390 .obj_size = sizeof(struct tcp_sock),
3391 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3392 .twsk_prot = &tcp_timewait_sock_ops,
3393 .rsk_prot = &tcp_request_sock_ops,
3394 .h.hashinfo = NULL,
3395 .no_autobind = true,
3396 .diag_destroy = tcp_abort,
3397 };
3398 EXPORT_SYMBOL(tcp_prot);
3399
3400 static void __net_exit tcp_sk_exit(struct net *net)
3401 {
3402 if (net->ipv4.tcp_congestion_control)
3403 bpf_module_put(net->ipv4.tcp_congestion_control,
3404 net->ipv4.tcp_congestion_control->owner);
3405 }
3406
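/* Select the ehash table for a new netns: allocate a per-netns table sized
 * by the creating netns' sysctl_tcp_child_ehash_entries, or fall back to
 * the global tcp_hashinfo. max_tw_buckets and max_syn_backlog are scaled
 * from the resulting table size.
 */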
3407 static void __net_init tcp_set_hashinfo(struct net *net)
3408 {
3409 struct inet_hashinfo *hinfo;
3410 unsigned int ehash_entries;
3411 struct net *old_net;
3412
3413 if (net_eq(net, &init_net))
3414 goto fallback;
3415
3416 old_net = current->nsproxy->net_ns;
3417 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3418 if (!ehash_entries)
3419 goto fallback;
3420
3421 ehash_entries = roundup_pow_of_two(ehash_entries);
3422 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3423 if (!hinfo) {
3424 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3425 "for a netns, fallback to the global one\n",
3426 ehash_entries);
3427 fallback:
3428 hinfo = &tcp_hashinfo;
3429 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3430 }
3431
3432 net->ipv4.tcp_death_row.hashinfo = hinfo;
3433 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3434 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3435 }
3436
3437 static int __net_init tcp_sk_init(struct net *net)
3438 {
3439 net->ipv4.sysctl_tcp_ecn = 2;
3440 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3441
3442 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3443 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3444 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3445 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3446 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3447
3448 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3449 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3450 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3451
3452 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3453 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3454 net->ipv4.sysctl_tcp_syncookies = 1;
3455 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3456 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3457 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3458 net->ipv4.sysctl_tcp_orphan_retries = 0;
3459 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3460 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3461 net->ipv4.sysctl_tcp_tw_reuse = 2;
3462 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3463 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3464
3465 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3466 tcp_set_hashinfo(net);
3467
3468 net->ipv4.sysctl_tcp_sack = 1;
3469 net->ipv4.sysctl_tcp_window_scaling = 1;
3470 net->ipv4.sysctl_tcp_timestamps = 1;
3471 net->ipv4.sysctl_tcp_early_retrans = 3;
3472 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3473 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3474 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3475 net->ipv4.sysctl_tcp_max_reordering = 300;
3476 net->ipv4.sysctl_tcp_dsack = 1;
3477 net->ipv4.sysctl_tcp_app_win = 31;
3478 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3479 net->ipv4.sysctl_tcp_frto = 2;
3480 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3481 /* This limits the percentage of the congestion window which we
3482 * will allow a single TSO frame to consume. Building TSO frames
3483 * which are too large can cause TCP streams to be bursty.
3484 */
3485 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3486 /* Default TSQ limit of 16 TSO segments */
3487 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3488
3489 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3490 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3491
3492 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3493 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3494 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3495 net->ipv4.sysctl_tcp_autocorking = 1;
3496 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3497 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3498 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3499 if (net != &init_net) {
3500 memcpy(net->ipv4.sysctl_tcp_rmem,
3501 init_net.ipv4.sysctl_tcp_rmem,
3502 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3503 memcpy(net->ipv4.sysctl_tcp_wmem,
3504 init_net.ipv4.sysctl_tcp_wmem,
3505 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3506 }
3507 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3508 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3509 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3510 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3511 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3512 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3513 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3514
3515 /* Set default values for PLB */
3516 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3517 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3518 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3519 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3520 /* Default congestion threshold for PLB to mark a round is 50% */
3521 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3522
3523 /* Reno is always built in */
3524 if (!net_eq(net, &init_net) &&
3525 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3526 init_net.ipv4.tcp_congestion_control->owner))
3527 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3528 else
3529 net->ipv4.tcp_congestion_control = &tcp_reno;
3530
3531 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3532 net->ipv4.sysctl_tcp_shrink_window = 0;
3533
3534 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3535 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3536
3537 return 0;
3538 }
3539
3540 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3541 {
3542 struct net *net;
3543
3544 /* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3545 * and from the failed setup_net() error unwinding path are serialized.
3546 *
3547 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those in
3548 * net_exit_list, the thread that dismantles a particular twsk must
3549 * do so without another thread progressing to refcount_dec_and_test() of
3550 * tcp_death_row.tw_refcount.
3551 */
3552 mutex_lock(&tcp_exit_batch_mutex);
3553
3554 tcp_twsk_purge(net_exit_list);
3555
3556 list_for_each_entry(net, net_exit_list, exit_list) {
3557 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3558 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3559 tcp_fastopen_ctx_destroy(net);
3560 }
3561
3562 mutex_unlock(&tcp_exit_batch_mutex);
3563 }
3564
3565 static struct pernet_operations __net_initdata tcp_sk_ops = {
3566 .init = tcp_sk_init,
3567 .exit = tcp_sk_exit,
3568 .exit_batch = tcp_sk_exit_batch,
3569 };
3570
3571 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3572 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3573 struct sock_common *sk_common, uid_t uid)
3574
3575 #define INIT_BATCH_SZ 16
3576
3577 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3578 {
3579 struct bpf_tcp_iter_state *iter = priv_data;
3580 int err;
3581
3582 err = bpf_iter_init_seq_net(priv_data, aux);
3583 if (err)
3584 return err;
3585
3586 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3587 if (err) {
3588 bpf_iter_fini_seq_net(priv_data);
3589 return err;
3590 }
3591
3592 return 0;
3593 }
3594
3595 static void bpf_iter_fini_tcp(void *priv_data)
3596 {
3597 struct bpf_tcp_iter_state *iter = priv_data;
3598
3599 bpf_iter_fini_seq_net(priv_data);
3600 kvfree(iter->batch);
3601 }
3602
3603 static const struct bpf_iter_seq_info tcp_seq_info = {
3604 .seq_ops = &bpf_iter_tcp_seq_ops,
3605 .init_seq_private = bpf_iter_init_tcp,
3606 .fini_seq_private = bpf_iter_fini_tcp,
3607 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3608 };
3609
3610 static const struct bpf_func_proto *
3611 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3612 const struct bpf_prog *prog)
3613 {
3614 switch (func_id) {
3615 case BPF_FUNC_setsockopt:
3616 return &bpf_sk_setsockopt_proto;
3617 case BPF_FUNC_getsockopt:
3618 return &bpf_sk_getsockopt_proto;
3619 default:
3620 return NULL;
3621 }
3622 }
3623
3624 static struct bpf_iter_reg tcp_reg_info = {
3625 .target = "tcp",
3626 .ctx_arg_info_size = 1,
3627 .ctx_arg_info = {
3628 { offsetof(struct bpf_iter__tcp, sk_common),
3629 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3630 },
3631 .get_func_proto = bpf_iter_tcp_get_func_proto,
3632 .seq_info = &tcp_seq_info,
3633 };
3634
3635 static void __init bpf_iter_register(void)
3636 {
3637 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3638 if (bpf_iter_reg_target(&tcp_reg_info))
3639 pr_warn("Warning: could not register bpf iterator tcp\n");
3640 }
3641
3642 #endif
3643
3644 void __init tcp_v4_init(void)
3645 {
3646 int cpu, res;
3647
3648 for_each_possible_cpu(cpu) {
3649 struct sock *sk;
3650
3651 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3652 IPPROTO_TCP, &init_net);
3653 if (res)
3654 panic("Failed to create the TCP control socket.\n");
3655 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3656
3657 /* Please enforce IP_DF and IPID==0 for RST and
3658 * ACK sent in SYN-RECV and TIME-WAIT state.
3659 */
3660 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3661
3662 sk->sk_clockid = CLOCK_MONOTONIC;
3663
3664 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3665 }
3666 if (register_pernet_subsys(&tcp_sk_ops))
3667 panic("Failed to create the TCP control socket.\n");
3668
3669 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3670 bpf_iter_register();
3671 #endif
3672 }
3673