1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73 #include <net/rstreason.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/inetdevice.h>
81 #include <linux/btf_ids.h>
82 #include <linux/skbuff_ref.h>
83
84 #include <crypto/hash.h>
85 #include <linux/scatterlist.h>
86
87 #include <trace/events/tcp.h>
88
89 #ifdef CONFIG_TCP_MD5SIG
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
91 __be32 daddr, __be32 saddr, const struct tcphdr *th);
92 #endif
93
94 struct inet_hashinfo tcp_hashinfo;
95 EXPORT_SYMBOL(tcp_hashinfo);
96
97 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
98 .bh_lock = INIT_LOCAL_LOCK(bh_lock),
99 };
100
101 static DEFINE_MUTEX(tcp_exit_batch_mutex);
102
103 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
104 {
105 return secure_tcp_seq(ip_hdr(skb)->daddr,
106 ip_hdr(skb)->saddr,
107 tcp_hdr(skb)->dest,
108 tcp_hdr(skb)->source);
109 }
110
111 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
112 {
113 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
114 }
115
116 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
117 {
118 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
119 const struct inet_timewait_sock *tw = inet_twsk(sktw);
120 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
121 struct tcp_sock *tp = tcp_sk(sk);
122 int ts_recent_stamp;
123
124 if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
125 reuse = 0;
126
127 if (reuse == 2) {
128 /* Still does not detect *everything* that goes through
129 * lo, since we require a loopback src or dst address
130 * or direct binding to 'lo' interface.
131 */
132 bool loopback = false;
133 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
134 loopback = true;
135 #if IS_ENABLED(CONFIG_IPV6)
136 if (tw->tw_family == AF_INET6) {
137 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
138 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
139 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
140 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
141 loopback = true;
142 } else
143 #endif
144 {
145 if (ipv4_is_loopback(tw->tw_daddr) ||
146 ipv4_is_loopback(tw->tw_rcv_saddr))
147 loopback = true;
148 }
149 if (!loopback)
150 reuse = 0;
151 }
152
153 /* With PAWS, it is safe from the viewpoint
154 of data integrity. Even without PAWS it is safe provided sequence
155 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
156
157 Actually, the idea is close to VJ's one, only timestamp cache is
158 held not per host, but per port pair and TW bucket is used as state
159 holder.
160
161 If TW bucket has been already destroyed we fall back to VJ's scheme
162 and use initial timestamp retrieved from peer table.
163 */
164 ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
165 if (ts_recent_stamp &&
166 (!twp || (reuse && time_after32(ktime_get_seconds(),
167 ts_recent_stamp)))) {
168 /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
169 * and releasing the bucket lock.
170 */
171 if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
172 return 0;
173
174 /* In case of repair and re-using TIME-WAIT sockets we still
175 * want to be sure that it is safe as above but honor the
176 * sequence numbers and time stamps set as part of the repair
177 * process.
178 *
179 * Without this check re-using a TIME-WAIT socket with TCP
180 * repair would accumulate a -1 on the repair assigned
181 * sequence number. The first time it is reused the sequence
182 * is -1, the second time -2, etc. This fixes that issue
183 * without appearing to create any others.
184 */
185 if (likely(!tp->repair)) {
186 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
187
188 if (!seq)
189 seq = 1;
190 WRITE_ONCE(tp->write_seq, seq);
191 tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
192 tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
193 }
194
195 return 1;
196 }
197
198 return 0;
199 }
200 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
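/*
 * Illustrative summary (not part of the build): the "reuse" value read above
 * comes from the net.ipv4.tcp_tw_reuse sysctl, whose documented settings map
 * onto the checks in tcp_twsk_unique() roughly as follows:
 *
 *	0 - never reuse a TIME-WAIT socket for a new outgoing connection
 *	1 - reuse when the timestamp/PAWS check above says it is safe
 *	2 - as 1, but only for loopback traffic (the reuse == 2 branch)
 */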
201
202 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
203 int addr_len)
204 {
205 /* This check is replicated from tcp_v4_connect() and intended to
206 * prevent BPF program called below from accessing bytes that are out
207 * of the bound specified by user in addr_len.
208 */
209 if (addr_len < sizeof(struct sockaddr_in))
210 return -EINVAL;
211
212 sock_owned_by_me(sk);
213
214 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
215 }
216
217 /* This will initiate an outgoing connection. */
218 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
219 {
220 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
221 struct inet_timewait_death_row *tcp_death_row;
222 struct inet_sock *inet = inet_sk(sk);
223 struct tcp_sock *tp = tcp_sk(sk);
224 struct ip_options_rcu *inet_opt;
225 struct net *net = sock_net(sk);
226 __be16 orig_sport, orig_dport;
227 __be32 daddr, nexthop;
228 struct flowi4 *fl4;
229 struct rtable *rt;
230 int err;
231
232 if (addr_len < sizeof(struct sockaddr_in))
233 return -EINVAL;
234
235 if (usin->sin_family != AF_INET)
236 return -EAFNOSUPPORT;
237
238 nexthop = daddr = usin->sin_addr.s_addr;
239 inet_opt = rcu_dereference_protected(inet->inet_opt,
240 lockdep_sock_is_held(sk));
241 if (inet_opt && inet_opt->opt.srr) {
242 if (!daddr)
243 return -EINVAL;
244 nexthop = inet_opt->opt.faddr;
245 }
246
247 orig_sport = inet->inet_sport;
248 orig_dport = usin->sin_port;
249 fl4 = &inet->cork.fl.u.ip4;
250 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
251 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
252 orig_dport, sk);
253 if (IS_ERR(rt)) {
254 err = PTR_ERR(rt);
255 if (err == -ENETUNREACH)
256 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
257 return err;
258 }
259
260 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
261 ip_rt_put(rt);
262 return -ENETUNREACH;
263 }
264
265 if (!inet_opt || !inet_opt->opt.srr)
266 daddr = fl4->daddr;
267
268 tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
269
270 if (!inet->inet_saddr) {
271 err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
272 if (err) {
273 ip_rt_put(rt);
274 return err;
275 }
276 } else {
277 sk_rcv_saddr_set(sk, inet->inet_saddr);
278 }
279
280 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
281 /* Reset inherited state */
282 tp->rx_opt.ts_recent = 0;
283 tp->rx_opt.ts_recent_stamp = 0;
284 if (likely(!tp->repair))
285 WRITE_ONCE(tp->write_seq, 0);
286 }
287
288 inet->inet_dport = usin->sin_port;
289 sk_daddr_set(sk, daddr);
290
291 inet_csk(sk)->icsk_ext_hdr_len = 0;
292 if (inet_opt)
293 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
294
295 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
296
297 /* Socket identity is still unknown (sport may be zero).
298 * However we set state to SYN-SENT and, without releasing the socket
299 * lock, select a source port, enter ourselves into the hash tables and
300 * complete initialization after this.
301 */
302 tcp_set_state(sk, TCP_SYN_SENT);
303 err = inet_hash_connect(tcp_death_row, sk);
304 if (err)
305 goto failure;
306
307 sk_set_txhash(sk);
308
309 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
310 inet->inet_sport, inet->inet_dport, sk);
311 if (IS_ERR(rt)) {
312 err = PTR_ERR(rt);
313 rt = NULL;
314 goto failure;
315 }
316 tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
317 /* OK, now commit destination to socket. */
318 sk->sk_gso_type = SKB_GSO_TCPV4;
319 sk_setup_caps(sk, &rt->dst);
320 rt = NULL;
321
322 if (likely(!tp->repair)) {
323 if (!tp->write_seq)
324 WRITE_ONCE(tp->write_seq,
325 secure_tcp_seq(inet->inet_saddr,
326 inet->inet_daddr,
327 inet->inet_sport,
328 usin->sin_port));
329 WRITE_ONCE(tp->tsoffset,
330 secure_tcp_ts_off(net, inet->inet_saddr,
331 inet->inet_daddr));
332 }
333
334 atomic_set(&inet->inet_id, get_random_u16());
335
336 if (tcp_fastopen_defer_connect(sk, &err))
337 return err;
338 if (err)
339 goto failure;
340
341 err = tcp_connect(sk);
342
343 if (err)
344 goto failure;
345
346 return 0;
347
348 failure:
349 /*
350 * This unhashes the socket and releases the local port,
351 * if necessary.
352 */
353 tcp_set_state(sk, TCP_CLOSE);
354 inet_bhash2_reset_saddr(sk);
355 ip_rt_put(rt);
356 sk->sk_route_caps = 0;
357 inet->inet_dport = 0;
358 return err;
359 }
360 EXPORT_SYMBOL(tcp_v4_connect);
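/*
 * Illustrative sketch (user-space code, not part of this file; the socket
 * API names are the standard POSIX ones): a connect() on an AF_INET stream
 * socket is what ultimately reaches tcp_v4_connect() above via
 * inet_stream_connect().
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *	};
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */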
361
362 /*
363 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
364 * It can be called through tcp_release_cb() if socket was owned by user
365 * at the time tcp_v4_err() was called to handle ICMP message.
366 */
367 void tcp_v4_mtu_reduced(struct sock *sk)
368 {
369 struct inet_sock *inet = inet_sk(sk);
370 struct dst_entry *dst;
371 u32 mtu;
372
373 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
374 return;
375 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
376 dst = inet_csk_update_pmtu(sk, mtu);
377 if (!dst)
378 return;
379
380 /* Something is about to go wrong... Remember the soft error
381 * in case this connection is not able to recover.
382 */
383 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
384 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
385
386 mtu = dst_mtu(dst);
387
388 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
389 ip_sk_accept_pmtu(sk) &&
390 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
391 tcp_sync_mss(sk, mtu);
392
393 /* Resend the TCP packet because it's
394 * clear that the old packet has been
395 * dropped. This is the new "fast" path mtu
396 * discovery.
397 */
398 tcp_simple_retransmit(sk);
399 } /* else let the usual retransmit timer handle it */
400 }
401 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
402
403 static void do_redirect(struct sk_buff *skb, struct sock *sk)
404 {
405 struct dst_entry *dst = __sk_dst_check(sk, 0);
406
407 if (dst)
408 dst->ops->redirect(dst, sk, skb);
409 }
410
411
412 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
413 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
414 {
415 struct request_sock *req = inet_reqsk(sk);
416 struct net *net = sock_net(sk);
417
418 /* ICMPs are not backlogged, hence we cannot get
419 * an established socket here.
420 */
421 if (seq != tcp_rsk(req)->snt_isn) {
422 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
423 } else if (abort) {
424 /*
425 * Still in SYN_RECV, just remove it silently.
426 * There is no good way to pass the error to the newly
427 * created socket, and POSIX does not want network
428 * errors returned from accept().
429 */
430 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
431 tcp_listendrop(req->rsk_listener);
432 }
433 reqsk_put(req);
434 }
435 EXPORT_SYMBOL(tcp_req_err);
436
437 /* TCP-LD (RFC 6069) logic */
438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439 {
440 struct inet_connection_sock *icsk = inet_csk(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 struct sk_buff *skb;
443 s32 remaining;
444 u32 delta_us;
445
446 if (sock_owned_by_user(sk))
447 return;
448
449 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
450 !icsk->icsk_backoff)
451 return;
452
453 skb = tcp_rtx_queue_head(sk);
454 if (WARN_ON_ONCE(!skb))
455 return;
456
457 icsk->icsk_backoff--;
458 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
460
461 tcp_mstamp_refresh(tp);
462 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464
465 if (remaining > 0) {
466 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
467 remaining, TCP_RTO_MAX);
468 } else {
469 /* RTO revert clocked out retransmission.
470 * Will retransmit now.
471 */
472 tcp_retransmit_timer(sk);
473 }
474 }
475 EXPORT_SYMBOL(tcp_ld_RTO_revert);
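/*
 * Simplified view of the arithmetic above (assumption: this matches
 * inet_csk_rto_backoff() in include/net/inet_connection_sock.h): after the
 * icsk_backoff decrement the retransmission timeout is recomputed as
 *
 *	rto = min((srtt-based RTO or TCP_TIMEOUT_INIT) << icsk_backoff,
 *		  TCP_RTO_MAX);
 *
 * i.e. one level of exponential backoff added by a spurious timeout is
 * undone, as RFC 6069 suggests.
 */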
476
477 /*
478 * This routine is called by the ICMP module when it gets some
479 * sort of error condition. If err < 0 then the socket should
480 * be closed and the error returned to the user. If err > 0
481 * it's just the icmp type << 8 | icmp code. After adjustment
482 * header points to the first 8 bytes of the tcp header. We need
483 * to find the appropriate port.
484 *
485 * The locking strategy used here is very "optimistic". When
486 * someone else accesses the socket the ICMP is just dropped
487 * and for some paths there is no check at all.
488 * A more general error queue to queue errors for later handling
489 * is probably better.
490 *
491 */
492
493 int tcp_v4_err(struct sk_buff *skb, u32 info)
494 {
495 const struct iphdr *iph = (const struct iphdr *)skb->data;
496 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
497 struct tcp_sock *tp;
498 const int type = icmp_hdr(skb)->type;
499 const int code = icmp_hdr(skb)->code;
500 struct sock *sk;
501 struct request_sock *fastopen;
502 u32 seq, snd_una;
503 int err;
504 struct net *net = dev_net(skb->dev);
505
506 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
507 iph->daddr, th->dest, iph->saddr,
508 ntohs(th->source), inet_iif(skb), 0);
509 if (!sk) {
510 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
511 return -ENOENT;
512 }
513 if (sk->sk_state == TCP_TIME_WAIT) {
514 /* To increase the counter of ignored icmps for TCP-AO */
515 tcp_ao_ignore_icmp(sk, AF_INET, type, code);
516 inet_twsk_put(inet_twsk(sk));
517 return 0;
518 }
519 seq = ntohl(th->seq);
520 if (sk->sk_state == TCP_NEW_SYN_RECV) {
521 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
522 type == ICMP_TIME_EXCEEDED ||
523 (type == ICMP_DEST_UNREACH &&
524 (code == ICMP_NET_UNREACH ||
525 code == ICMP_HOST_UNREACH)));
526 return 0;
527 }
528
529 if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
530 sock_put(sk);
531 return 0;
532 }
533
534 bh_lock_sock(sk);
535 /* If too many ICMPs get dropped on busy
536 * servers this needs to be solved differently.
537 * We do take care of PMTU discovery (RFC1191) special case :
538 * we can receive locally generated ICMP messages while socket is held.
539 */
540 if (sock_owned_by_user(sk)) {
541 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
542 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
543 }
544 if (sk->sk_state == TCP_CLOSE)
545 goto out;
546
547 if (static_branch_unlikely(&ip4_min_ttl)) {
548 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
549 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
550 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
551 goto out;
552 }
553 }
554
555 tp = tcp_sk(sk);
556 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
557 fastopen = rcu_dereference(tp->fastopen_rsk);
558 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
559 if (sk->sk_state != TCP_LISTEN &&
560 !between(seq, snd_una, tp->snd_nxt)) {
561 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
562 goto out;
563 }
564
565 switch (type) {
566 case ICMP_REDIRECT:
567 if (!sock_owned_by_user(sk))
568 do_redirect(skb, sk);
569 goto out;
570 case ICMP_SOURCE_QUENCH:
571 /* Just silently ignore these. */
572 goto out;
573 case ICMP_PARAMETERPROB:
574 err = EPROTO;
575 break;
576 case ICMP_DEST_UNREACH:
577 if (code > NR_ICMP_UNREACH)
578 goto out;
579
580 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
581 /* We are not interested in TCP_LISTEN and open_requests
582 * (SYN-ACKs sent out by Linux are always < 576 bytes so
583 * they should go through unfragmented).
584 */
585 if (sk->sk_state == TCP_LISTEN)
586 goto out;
587
588 WRITE_ONCE(tp->mtu_info, info);
589 if (!sock_owned_by_user(sk)) {
590 tcp_v4_mtu_reduced(sk);
591 } else {
592 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
593 sock_hold(sk);
594 }
595 goto out;
596 }
597
598 err = icmp_err_convert[code].errno;
599 /* check if this ICMP message allows revert of backoff.
600 * (see RFC 6069)
601 */
602 if (!fastopen &&
603 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
604 tcp_ld_RTO_revert(sk, seq);
605 break;
606 case ICMP_TIME_EXCEEDED:
607 err = EHOSTUNREACH;
608 break;
609 default:
610 goto out;
611 }
612
613 switch (sk->sk_state) {
614 case TCP_SYN_SENT:
615 case TCP_SYN_RECV:
616 /* Only in fast or simultaneous open. If a fast open socket is
617 * already accepted it is treated as a connected one below.
618 */
619 if (fastopen && !fastopen->sk)
620 break;
621
622 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
623
624 if (!sock_owned_by_user(sk))
625 tcp_done_with_error(sk, err);
626 else
627 WRITE_ONCE(sk->sk_err_soft, err);
628 goto out;
629 }
630
631 /* If we've already connected we will keep trying
632 * until we time out, or the user gives up.
633 *
634 * rfc1122 4.2.3.9 allows to consider as hard errors
635 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
636 * but it is obsoleted by pmtu discovery).
637 *
638 * Note, that in modern internet, where routing is unreliable
639 * and in each dark corner broken firewalls sit, sending random
640 * errors ordered by their masters, even these two messages finally lose
641 * their original sense (even Linux sends invalid PORT_UNREACHs)
642 *
643 * Now we are in compliance with RFCs.
644 * --ANK (980905)
645 */
646
647 if (!sock_owned_by_user(sk) &&
648 inet_test_bit(RECVERR, sk)) {
649 WRITE_ONCE(sk->sk_err, err);
650 sk_error_report(sk);
651 } else { /* Only an error on timeout */
652 WRITE_ONCE(sk->sk_err_soft, err);
653 }
654
655 out:
656 bh_unlock_sock(sk);
657 sock_put(sk);
658 return 0;
659 }
660
661 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
662 {
663 struct tcphdr *th = tcp_hdr(skb);
664
665 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
666 skb->csum_start = skb_transport_header(skb) - skb->head;
667 skb->csum_offset = offsetof(struct tcphdr, check);
668 }
669
670 /* This routine computes an IPv4 TCP checksum. */
671 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
672 {
673 const struct inet_sock *inet = inet_sk(sk);
674
675 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
676 }
677 EXPORT_SYMBOL(tcp_v4_send_check);
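/*
 * Illustrative sketch (assumption: field layout mirrors struct tcp4_pseudohdr
 * used by the MD5 code later in this file): the partial checksum seeded by
 * __tcp_v4_send_check() covers only the IPv4 pseudo-header below;
 * csum_start/csum_offset then tell the NIC (or skb_checksum_help()) where to
 * fold in the TCP header and payload.
 *
 *	saddr    (32 bits)  source IPv4 address
 *	daddr    (32 bits)  destination IPv4 address
 *	zero     ( 8 bits)  padding, always 0
 *	protocol ( 8 bits)  IPPROTO_TCP (6)
 *	length   (16 bits)  TCP header + payload length (skb->len here)
 */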
678
679 #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
680
681 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
682 const struct tcp_ao_hdr *aoh,
683 struct ip_reply_arg *arg, struct tcphdr *reply,
684 __be32 reply_options[REPLY_OPTIONS_LEN])
685 {
686 #ifdef CONFIG_TCP_AO
687 int sdif = tcp_v4_sdif(skb);
688 int dif = inet_iif(skb);
689 int l3index = sdif ? dif : 0;
690 bool allocated_traffic_key;
691 struct tcp_ao_key *key;
692 char *traffic_key;
693 bool drop = true;
694 u32 ao_sne = 0;
695 u8 keyid;
696
697 rcu_read_lock();
698 if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
699 &key, &traffic_key, &allocated_traffic_key,
700 &keyid, &ao_sne))
701 goto out;
702
703 reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
704 (aoh->rnext_keyid << 8) | keyid);
705 arg->iov[0].iov_len += tcp_ao_len_aligned(key);
706 reply->doff = arg->iov[0].iov_len / 4;
707
708 if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
709 key, traffic_key,
710 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
711 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
712 reply, ao_sne))
713 goto out;
714 drop = false;
715 out:
716 rcu_read_unlock();
717 if (allocated_traffic_key)
718 kfree(traffic_key);
719 return drop;
720 #else
721 return true;
722 #endif
723 }
724
725 /*
726 * This routine will send an RST to the other tcp.
727 *
728 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
729 * for reset.
730 * Answer: if a packet caused RST, it is not for a socket
731 * existing in our system, if it is matched to a socket,
732 * it is just duplicate segment or bug in other side's TCP.
733 * So we build the reply based only on the parameters
734 * that arrived with the segment.
735 * Exception: precedence violation. We do not implement it in any case.
736 */
737
738 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
739 enum sk_rst_reason reason)
740 {
741 const struct tcphdr *th = tcp_hdr(skb);
742 struct {
743 struct tcphdr th;
744 __be32 opt[REPLY_OPTIONS_LEN];
745 } rep;
746 const __u8 *md5_hash_location = NULL;
747 const struct tcp_ao_hdr *aoh;
748 struct ip_reply_arg arg;
749 #ifdef CONFIG_TCP_MD5SIG
750 struct tcp_md5sig_key *key = NULL;
751 unsigned char newhash[16];
752 struct sock *sk1 = NULL;
753 int genhash;
754 #endif
755 u64 transmit_time = 0;
756 struct sock *ctl_sk;
757 struct net *net;
758 u32 txhash = 0;
759
760 /* Never send a reset in response to a reset. */
761 if (th->rst)
762 return;
763
764 /* If sk not NULL, it means we did a successful lookup and incoming
765 * route had to be correct. prequeue might have dropped our dst.
766 */
767 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
768 return;
769
770 /* Swap the send and the receive. */
771 memset(&rep, 0, sizeof(rep));
772 rep.th.dest = th->source;
773 rep.th.source = th->dest;
774 rep.th.doff = sizeof(struct tcphdr) / 4;
775 rep.th.rst = 1;
776
777 if (th->ack) {
778 rep.th.seq = th->ack_seq;
779 } else {
780 rep.th.ack = 1;
781 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
782 skb->len - (th->doff << 2));
783 }
784
785 memset(&arg, 0, sizeof(arg));
786 arg.iov[0].iov_base = (unsigned char *)&rep;
787 arg.iov[0].iov_len = sizeof(rep.th);
788
789 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
790
791 /* Invalid TCP option size or twice included auth */
792 if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
793 return;
794
795 if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
796 return;
797
798 #ifdef CONFIG_TCP_MD5SIG
799 rcu_read_lock();
800 if (sk && sk_fullsock(sk)) {
801 const union tcp_md5_addr *addr;
802 int l3index;
803
804 /* sdif set, means packet ingressed via a device
805 * in an L3 domain and inet_iif is set to it.
806 */
807 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
808 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
809 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
810 } else if (md5_hash_location) {
811 const union tcp_md5_addr *addr;
812 int sdif = tcp_v4_sdif(skb);
813 int dif = inet_iif(skb);
814 int l3index;
815
816 /*
817 * active side is lost. Try to find listening socket through
818 * source port, and then find md5 key through listening socket.
819 * We do not lose security here:
820 * the incoming packet is checked against the md5 hash of the found key,
821 * and no RST is generated if the md5 hash doesn't match.
822 */
823 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
824 NULL, 0, ip_hdr(skb)->saddr,
825 th->source, ip_hdr(skb)->daddr,
826 ntohs(th->source), dif, sdif);
827 /* don't send rst if it can't find key */
828 if (!sk1)
829 goto out;
830
831 /* sdif set, means packet ingressed via a device
832 * in an L3 domain and dif is set to it.
833 */
834 l3index = sdif ? dif : 0;
835 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
836 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
837 if (!key)
838 goto out;
839
840
841 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
842 if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
843 goto out;
844
845 }
846
847 if (key) {
848 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
849 (TCPOPT_NOP << 16) |
850 (TCPOPT_MD5SIG << 8) |
851 TCPOLEN_MD5SIG);
852 /* Update length and the length the header thinks exists */
853 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
854 rep.th.doff = arg.iov[0].iov_len / 4;
855
856 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
857 key, ip_hdr(skb)->saddr,
858 ip_hdr(skb)->daddr, &rep.th);
859 }
860 #endif
861 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
862 if (rep.opt[0] == 0) {
863 __be32 mrst = mptcp_reset_option(skb);
864
865 if (mrst) {
866 rep.opt[0] = mrst;
867 arg.iov[0].iov_len += sizeof(mrst);
868 rep.th.doff = arg.iov[0].iov_len / 4;
869 }
870 }
871
872 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
873 ip_hdr(skb)->saddr, /* XXX */
874 arg.iov[0].iov_len, IPPROTO_TCP, 0);
875 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
876 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
877
878 /* When socket is gone, all binding information is lost.
879 * routing might fail in this case. No choice here, if we choose to force
880 * input interface, we will misroute in case of asymmetric route.
881 */
882 if (sk)
883 arg.bound_dev_if = sk->sk_bound_dev_if;
884
885 trace_tcp_send_reset(sk, skb, reason);
886
887 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
888 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
889
890 arg.tos = ip_hdr(skb)->tos;
891 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
892 local_bh_disable();
893 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
894 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
895
896 sock_net_set(ctl_sk, net);
897 if (sk) {
898 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
899 inet_twsk(sk)->tw_mark : sk->sk_mark;
900 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
901 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
902 transmit_time = tcp_transmit_time(sk);
903 xfrm_sk_clone_policy(ctl_sk, sk);
904 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
905 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
906 } else {
907 ctl_sk->sk_mark = 0;
908 ctl_sk->sk_priority = 0;
909 }
910 ip_send_unicast_reply(ctl_sk,
911 skb, &TCP_SKB_CB(skb)->header.h4.opt,
912 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
913 &arg, arg.iov[0].iov_len,
914 transmit_time, txhash);
915
916 xfrm_sk_free_policy(ctl_sk);
917 sock_net_set(ctl_sk, &init_net);
918 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
919 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
920 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
921 local_bh_enable();
922
923 #ifdef CONFIG_TCP_MD5SIG
924 out:
925 rcu_read_unlock();
926 #endif
927 }
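/*
 * Worked example of the sequence number selection above (per RFC 9293,
 * "Reset Generation"): an incoming segment carrying no ACK, e.g. a bare SYN
 * with seq = S and no payload, elicits a RST with rep.th.ack = 1 and
 * rep.th.ack_seq = S + 1 (seq + syn). If the offending segment did carry an
 * ACK with ack_seq = A, the RST is simply sent with seq = A and no ACK bit.
 */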
928
929 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
930 outside socket context, is certainly ugly. What can I do?
931 */
932
933 static void tcp_v4_send_ack(const struct sock *sk,
934 struct sk_buff *skb, u32 seq, u32 ack,
935 u32 win, u32 tsval, u32 tsecr, int oif,
936 struct tcp_key *key,
937 int reply_flags, u8 tos, u32 txhash)
938 {
939 const struct tcphdr *th = tcp_hdr(skb);
940 struct {
941 struct tcphdr th;
942 __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
943 } rep;
944 struct net *net = sock_net(sk);
945 struct ip_reply_arg arg;
946 struct sock *ctl_sk;
947 u64 transmit_time;
948
949 memset(&rep.th, 0, sizeof(struct tcphdr));
950 memset(&arg, 0, sizeof(arg));
951
952 arg.iov[0].iov_base = (unsigned char *)&rep;
953 arg.iov[0].iov_len = sizeof(rep.th);
954 if (tsecr) {
955 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
956 (TCPOPT_TIMESTAMP << 8) |
957 TCPOLEN_TIMESTAMP);
958 rep.opt[1] = htonl(tsval);
959 rep.opt[2] = htonl(tsecr);
960 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
961 }
962
963 /* Swap the send and the receive. */
964 rep.th.dest = th->source;
965 rep.th.source = th->dest;
966 rep.th.doff = arg.iov[0].iov_len / 4;
967 rep.th.seq = htonl(seq);
968 rep.th.ack_seq = htonl(ack);
969 rep.th.ack = 1;
970 rep.th.window = htons(win);
971
972 #ifdef CONFIG_TCP_MD5SIG
973 if (tcp_key_is_md5(key)) {
974 int offset = (tsecr) ? 3 : 0;
975
976 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
977 (TCPOPT_NOP << 16) |
978 (TCPOPT_MD5SIG << 8) |
979 TCPOLEN_MD5SIG);
980 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
981 rep.th.doff = arg.iov[0].iov_len/4;
982
983 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
984 key->md5_key, ip_hdr(skb)->saddr,
985 ip_hdr(skb)->daddr, &rep.th);
986 }
987 #endif
988 #ifdef CONFIG_TCP_AO
989 if (tcp_key_is_ao(key)) {
990 int offset = (tsecr) ? 3 : 0;
991
992 rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
993 (tcp_ao_len(key->ao_key) << 16) |
994 (key->ao_key->sndid << 8) |
995 key->rcv_next);
996 arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
997 rep.th.doff = arg.iov[0].iov_len / 4;
998
999 tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
1000 key->ao_key, key->traffic_key,
1001 (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
1002 (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
1003 &rep.th, key->sne);
1004 }
1005 #endif
1006 arg.flags = reply_flags;
1007 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
1008 ip_hdr(skb)->saddr, /* XXX */
1009 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1010 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1011 if (oif)
1012 arg.bound_dev_if = oif;
1013 arg.tos = tos;
1014 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
1015 local_bh_disable();
1016 local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
1017 ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
1018 sock_net_set(ctl_sk, net);
1019 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
1020 inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
1021 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
1022 inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
1023 transmit_time = tcp_transmit_time(sk);
1024 ip_send_unicast_reply(ctl_sk,
1025 skb, &TCP_SKB_CB(skb)->header.h4.opt,
1026 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
1027 &arg, arg.iov[0].iov_len,
1028 transmit_time, txhash);
1029
1030 sock_net_set(ctl_sk, &init_net);
1031 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
1032 local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
1033 local_bh_enable();
1034 }
1035
1036 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1037 {
1038 struct inet_timewait_sock *tw = inet_twsk(sk);
1039 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1040 struct tcp_key key = {};
1041 #ifdef CONFIG_TCP_AO
1042 struct tcp_ao_info *ao_info;
1043
1044 if (static_branch_unlikely(&tcp_ao_needed.key)) {
1045 /* FIXME: the segment to-be-acked is not verified yet */
1046 ao_info = rcu_dereference(tcptw->ao_info);
1047 if (ao_info) {
1048 const struct tcp_ao_hdr *aoh;
1049
1050 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
1051 inet_twsk_put(tw);
1052 return;
1053 }
1054
1055 if (aoh)
1056 key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
1057 }
1058 }
1059 if (key.ao_key) {
1060 struct tcp_ao_key *rnext_key;
1061
1062 key.traffic_key = snd_other_key(key.ao_key);
1063 key.sne = READ_ONCE(ao_info->snd_sne);
1064 rnext_key = READ_ONCE(ao_info->rnext_key);
1065 key.rcv_next = rnext_key->rcvid;
1066 key.type = TCP_KEY_AO;
1067 #else
1068 if (0) {
1069 #endif
1070 } else if (static_branch_tcp_md5()) {
1071 key.md5_key = tcp_twsk_md5_key(tcptw);
1072 if (key.md5_key)
1073 key.type = TCP_KEY_MD5;
1074 }
1075
1076 tcp_v4_send_ack(sk, skb,
1077 tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
1078 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
1079 tcp_tw_tsval(tcptw),
1080 READ_ONCE(tcptw->tw_ts_recent),
1081 tw->tw_bound_dev_if, &key,
1082 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
1083 tw->tw_tos,
1084 tw->tw_txhash);
1085
1086 inet_twsk_put(tw);
1087 }
1088
1089 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
1090 struct request_sock *req)
1091 {
1092 struct tcp_key key = {};
1093
1094 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
1095 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
1096 */
1097 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
1098 tcp_sk(sk)->snd_nxt;
1099
1100 #ifdef CONFIG_TCP_AO
1101 if (static_branch_unlikely(&tcp_ao_needed.key) &&
1102 tcp_rsk_used_ao(req)) {
1103 const union tcp_md5_addr *addr;
1104 const struct tcp_ao_hdr *aoh;
1105 int l3index;
1106
1107 /* Invalid TCP option size or twice included auth */
1108 if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
1109 return;
1110 if (!aoh)
1111 return;
1112
1113 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1114 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1115 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
1116 aoh->rnext_keyid, -1);
1117 if (unlikely(!key.ao_key)) {
1118 /* Send ACK with any matching MKT for the peer */
1119 key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
1120 /* Matching key disappeared (user removed the key?)
1121 * let the handshake timeout.
1122 */
1123 if (!key.ao_key) {
1124 net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
1125 addr,
1126 ntohs(tcp_hdr(skb)->source),
1127 &ip_hdr(skb)->daddr,
1128 ntohs(tcp_hdr(skb)->dest));
1129 return;
1130 }
1131 }
1132 key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
1133 if (!key.traffic_key)
1134 return;
1135
1136 key.type = TCP_KEY_AO;
1137 key.rcv_next = aoh->keyid;
1138 tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
1139 #else
1140 if (0) {
1141 #endif
1142 } else if (static_branch_tcp_md5()) {
1143 const union tcp_md5_addr *addr;
1144 int l3index;
1145
1146 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
1147 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
1148 key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1149 if (key.md5_key)
1150 key.type = TCP_KEY_MD5;
1151 }
1152
1153 tcp_v4_send_ack(sk, skb, seq,
1154 tcp_rsk(req)->rcv_nxt,
1155 tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
1156 tcp_rsk_tsval(tcp_rsk(req)),
1157 READ_ONCE(req->ts_recent),
1158 0, &key,
1159 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1160 ip_hdr(skb)->tos,
1161 READ_ONCE(tcp_rsk(req)->txhash));
1162 if (tcp_key_is_ao(&key))
1163 kfree(key.traffic_key);
1164 }
1165
1166 /*
1167 * Send a SYN-ACK after having received a SYN.
1168 * This still operates on a request_sock only, not on a big
1169 * socket.
1170 */
1171 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1172 struct flowi *fl,
1173 struct request_sock *req,
1174 struct tcp_fastopen_cookie *foc,
1175 enum tcp_synack_type synack_type,
1176 struct sk_buff *syn_skb)
1177 {
1178 const struct inet_request_sock *ireq = inet_rsk(req);
1179 struct flowi4 fl4;
1180 int err = -1;
1181 struct sk_buff *skb;
1182 u8 tos;
1183
1184 /* First, grab a route. */
1185 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1186 return -1;
1187
1188 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1189
1190 if (skb) {
1191 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1192
1193 tos = READ_ONCE(inet_sk(sk)->tos);
1194
1195 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1196 tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1197 (tos & INET_ECN_MASK);
1198
1199 if (!INET_ECN_is_capable(tos) &&
1200 tcp_bpf_ca_needs_ecn((struct sock *)req))
1201 tos |= INET_ECN_ECT_0;
1202
1203 rcu_read_lock();
1204 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1205 ireq->ir_rmt_addr,
1206 rcu_dereference(ireq->ireq_opt),
1207 tos);
1208 rcu_read_unlock();
1209 err = net_xmit_eval(err);
1210 }
1211
1212 return err;
1213 }
1214
1215 /*
1216 * IPv4 request_sock destructor.
1217 */
1218 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1219 {
1220 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1221 }
1222
1223 #ifdef CONFIG_TCP_MD5SIG
1224 /*
1225 * RFC2385 MD5 checksumming requires a mapping of
1226 * IP address->MD5 Key.
1227 * We need to maintain these in the sk structure.
1228 */
1229
1230 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1231 EXPORT_SYMBOL(tcp_md5_needed);
1232
1233 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1234 {
1235 if (!old)
1236 return true;
1237
1238 /* l3index always overrides non-l3index */
1239 if (old->l3index && new->l3index == 0)
1240 return false;
1241 if (old->l3index == 0 && new->l3index)
1242 return true;
1243
1244 return old->prefixlen < new->prefixlen;
1245 }
1246
1247 /* Find the Key structure for an address. */
1248 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1249 const union tcp_md5_addr *addr,
1250 int family, bool any_l3index)
1251 {
1252 const struct tcp_sock *tp = tcp_sk(sk);
1253 struct tcp_md5sig_key *key;
1254 const struct tcp_md5sig_info *md5sig;
1255 __be32 mask;
1256 struct tcp_md5sig_key *best_match = NULL;
1257 bool match;
1258
1259 /* caller either holds rcu_read_lock() or socket lock */
1260 md5sig = rcu_dereference_check(tp->md5sig_info,
1261 lockdep_sock_is_held(sk));
1262 if (!md5sig)
1263 return NULL;
1264
1265 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1266 lockdep_sock_is_held(sk)) {
1267 if (key->family != family)
1268 continue;
1269 if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1270 key->l3index != l3index)
1271 continue;
1272 if (family == AF_INET) {
1273 mask = inet_make_mask(key->prefixlen);
1274 match = (key->addr.a4.s_addr & mask) ==
1275 (addr->a4.s_addr & mask);
1276 #if IS_ENABLED(CONFIG_IPV6)
1277 } else if (family == AF_INET6) {
1278 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1279 key->prefixlen);
1280 #endif
1281 } else {
1282 match = false;
1283 }
1284
1285 if (match && better_md5_match(best_match, key))
1286 best_match = key;
1287 }
1288 return best_match;
1289 }
1290 EXPORT_SYMBOL(__tcp_md5_do_lookup);
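/*
 * Worked example of the precedence implemented by better_md5_match() and the
 * loop above (hypothetical key set): with one key for 10.0.0.0/8 carrying no
 * ifindex binding and another for 10.1.0.0/16 bound to an L3 master device,
 * a segment from 10.1.2.3 arriving through that VRF matches both, but the
 * /16 key wins - a key with a non-zero l3index always beats an unbound one,
 * and only between keys of equal binding does the longer prefix decide.
 */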
1291
1292 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1293 const union tcp_md5_addr *addr,
1294 int family, u8 prefixlen,
1295 int l3index, u8 flags)
1296 {
1297 const struct tcp_sock *tp = tcp_sk(sk);
1298 struct tcp_md5sig_key *key;
1299 unsigned int size = sizeof(struct in_addr);
1300 const struct tcp_md5sig_info *md5sig;
1301
1302 /* caller either holds rcu_read_lock() or socket lock */
1303 md5sig = rcu_dereference_check(tp->md5sig_info,
1304 lockdep_sock_is_held(sk));
1305 if (!md5sig)
1306 return NULL;
1307 #if IS_ENABLED(CONFIG_IPV6)
1308 if (family == AF_INET6)
1309 size = sizeof(struct in6_addr);
1310 #endif
1311 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1312 lockdep_sock_is_held(sk)) {
1313 if (key->family != family)
1314 continue;
1315 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1316 continue;
1317 if (key->l3index != l3index)
1318 continue;
1319 if (!memcmp(&key->addr, addr, size) &&
1320 key->prefixlen == prefixlen)
1321 return key;
1322 }
1323 return NULL;
1324 }
1325
1326 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1327 const struct sock *addr_sk)
1328 {
1329 const union tcp_md5_addr *addr;
1330 int l3index;
1331
1332 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1333 addr_sk->sk_bound_dev_if);
1334 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1335 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1336 }
1337 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1338
1339 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1340 {
1341 struct tcp_sock *tp = tcp_sk(sk);
1342 struct tcp_md5sig_info *md5sig;
1343
1344 md5sig = kmalloc(sizeof(*md5sig), gfp);
1345 if (!md5sig)
1346 return -ENOMEM;
1347
1348 sk_gso_disable(sk);
1349 INIT_HLIST_HEAD(&md5sig->head);
1350 rcu_assign_pointer(tp->md5sig_info, md5sig);
1351 return 0;
1352 }
1353
1354 /* This can be called on a newly created socket, from other files */
1355 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1356 int family, u8 prefixlen, int l3index, u8 flags,
1357 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1358 {
1359 /* Add Key to the list */
1360 struct tcp_md5sig_key *key;
1361 struct tcp_sock *tp = tcp_sk(sk);
1362 struct tcp_md5sig_info *md5sig;
1363
1364 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1365 if (key) {
1366 /* Pre-existing entry - just update that one.
1367 * Note that the key might be used concurrently.
1368 * data_race() is telling kcsan that we do not care of
1369 * key mismatches, since changing MD5 key on live flows
1370 * can lead to packet drops.
1371 */
1372 data_race(memcpy(key->key, newkey, newkeylen));
1373
1374 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1375 * Also note that a reader could catch new key->keylen value
1376 * but old key->key[], this is the reason we use __GFP_ZERO
1377 * at sock_kmalloc() time below these lines.
1378 */
1379 WRITE_ONCE(key->keylen, newkeylen);
1380
1381 return 0;
1382 }
1383
1384 md5sig = rcu_dereference_protected(tp->md5sig_info,
1385 lockdep_sock_is_held(sk));
1386
1387 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1388 if (!key)
1389 return -ENOMEM;
1390
1391 memcpy(key->key, newkey, newkeylen);
1392 key->keylen = newkeylen;
1393 key->family = family;
1394 key->prefixlen = prefixlen;
1395 key->l3index = l3index;
1396 key->flags = flags;
1397 memcpy(&key->addr, addr,
1398 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1399 sizeof(struct in_addr));
1400 hlist_add_head_rcu(&key->node, &md5sig->head);
1401 return 0;
1402 }
1403
1404 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1405 int family, u8 prefixlen, int l3index, u8 flags,
1406 const u8 *newkey, u8 newkeylen)
1407 {
1408 struct tcp_sock *tp = tcp_sk(sk);
1409
1410 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1411 if (tcp_md5_alloc_sigpool())
1412 return -ENOMEM;
1413
1414 if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
1415 tcp_md5_release_sigpool();
1416 return -ENOMEM;
1417 }
1418
1419 if (!static_branch_inc(&tcp_md5_needed.key)) {
1420 struct tcp_md5sig_info *md5sig;
1421
1422 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1423 rcu_assign_pointer(tp->md5sig_info, NULL);
1424 kfree_rcu(md5sig, rcu);
1425 tcp_md5_release_sigpool();
1426 return -EUSERS;
1427 }
1428 }
1429
1430 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1431 newkey, newkeylen, GFP_KERNEL);
1432 }
1433 EXPORT_SYMBOL(tcp_md5_do_add);
1434
1435 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1436 int family, u8 prefixlen, int l3index,
1437 struct tcp_md5sig_key *key)
1438 {
1439 struct tcp_sock *tp = tcp_sk(sk);
1440
1441 if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1442 tcp_md5_add_sigpool();
1443
1444 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
1445 tcp_md5_release_sigpool();
1446 return -ENOMEM;
1447 }
1448
1449 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1450 struct tcp_md5sig_info *md5sig;
1451
1452 md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1453 net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1454 rcu_assign_pointer(tp->md5sig_info, NULL);
1455 kfree_rcu(md5sig, rcu);
1456 tcp_md5_release_sigpool();
1457 return -EUSERS;
1458 }
1459 }
1460
1461 return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1462 key->flags, key->key, key->keylen,
1463 sk_gfp_mask(sk, GFP_ATOMIC));
1464 }
1465 EXPORT_SYMBOL(tcp_md5_key_copy);
1466
1467 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1468 u8 prefixlen, int l3index, u8 flags)
1469 {
1470 struct tcp_md5sig_key *key;
1471
1472 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1473 if (!key)
1474 return -ENOENT;
1475 hlist_del_rcu(&key->node);
1476 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1477 kfree_rcu(key, rcu);
1478 return 0;
1479 }
1480 EXPORT_SYMBOL(tcp_md5_do_del);
1481
1482 void tcp_clear_md5_list(struct sock *sk)
1483 {
1484 struct tcp_sock *tp = tcp_sk(sk);
1485 struct tcp_md5sig_key *key;
1486 struct hlist_node *n;
1487 struct tcp_md5sig_info *md5sig;
1488
1489 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1490
1491 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1492 hlist_del_rcu(&key->node);
1493 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1494 kfree_rcu(key, rcu);
1495 }
1496 }
1497
1498 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1499 sockptr_t optval, int optlen)
1500 {
1501 struct tcp_md5sig cmd;
1502 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1503 const union tcp_md5_addr *addr;
1504 u8 prefixlen = 32;
1505 int l3index = 0;
1506 bool l3flag;
1507 u8 flags;
1508
1509 if (optlen < sizeof(cmd))
1510 return -EINVAL;
1511
1512 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1513 return -EFAULT;
1514
1515 if (sin->sin_family != AF_INET)
1516 return -EINVAL;
1517
1518 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1519 l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1520
1521 if (optname == TCP_MD5SIG_EXT &&
1522 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1523 prefixlen = cmd.tcpm_prefixlen;
1524 if (prefixlen > 32)
1525 return -EINVAL;
1526 }
1527
1528 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1529 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1530 struct net_device *dev;
1531
1532 rcu_read_lock();
1533 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1534 if (dev && netif_is_l3_master(dev))
1535 l3index = dev->ifindex;
1536
1537 rcu_read_unlock();
1538
1539 /* ok to reference set/not set outside of rcu;
1540 * right now device MUST be an L3 master
1541 */
1542 if (!dev || !l3index)
1543 return -EINVAL;
1544 }
1545
1546 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1547
1548 if (!cmd.tcpm_keylen)
1549 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1550
1551 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1552 return -EINVAL;
1553
1554 /* Don't allow keys for peers that have a matching TCP-AO key.
1555 * See the comment in tcp_ao_add_cmd()
1556 */
1557 if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1558 return -EKEYREJECTED;
1559
1560 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1561 cmd.tcpm_key, cmd.tcpm_keylen);
1562 }
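/*
 * Illustrative sketch (user-space code, not part of this file; struct
 * tcp_md5sig and TCP_MD5SIG come from the UAPI <linux/tcp.h>): the parsing
 * above services a setsockopt() request such as
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */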
1563
1564 static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
1565 __be32 daddr, __be32 saddr,
1566 const struct tcphdr *th, int nbytes)
1567 {
1568 struct tcp4_pseudohdr *bp;
1569 struct scatterlist sg;
1570 struct tcphdr *_th;
1571
1572 bp = hp->scratch;
1573 bp->saddr = saddr;
1574 bp->daddr = daddr;
1575 bp->pad = 0;
1576 bp->protocol = IPPROTO_TCP;
1577 bp->len = cpu_to_be16(nbytes);
1578
1579 _th = (struct tcphdr *)(bp + 1);
1580 memcpy(_th, th, sizeof(*th));
1581 _th->check = 0;
1582
1583 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1584 ahash_request_set_crypt(hp->req, &sg, NULL,
1585 sizeof(*bp) + sizeof(*th));
1586 return crypto_ahash_update(hp->req);
1587 }
1588
1589 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1590 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1591 {
1592 struct tcp_sigpool hp;
1593
1594 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1595 goto clear_hash_nostart;
1596
1597 if (crypto_ahash_init(hp.req))
1598 goto clear_hash;
1599 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
1600 goto clear_hash;
1601 if (tcp_md5_hash_key(&hp, key))
1602 goto clear_hash;
1603 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1604 if (crypto_ahash_final(hp.req))
1605 goto clear_hash;
1606
1607 tcp_sigpool_end(&hp);
1608 return 0;
1609
1610 clear_hash:
1611 tcp_sigpool_end(&hp);
1612 clear_hash_nostart:
1613 memset(md5_hash, 0, 16);
1614 return 1;
1615 }
1616
1617 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1618 const struct sock *sk,
1619 const struct sk_buff *skb)
1620 {
1621 const struct tcphdr *th = tcp_hdr(skb);
1622 struct tcp_sigpool hp;
1623 __be32 saddr, daddr;
1624
1625 if (sk) { /* valid for establish/request sockets */
1626 saddr = sk->sk_rcv_saddr;
1627 daddr = sk->sk_daddr;
1628 } else {
1629 const struct iphdr *iph = ip_hdr(skb);
1630 saddr = iph->saddr;
1631 daddr = iph->daddr;
1632 }
1633
1634 if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
1635 goto clear_hash_nostart;
1636
1637 if (crypto_ahash_init(hp.req))
1638 goto clear_hash;
1639
1640 if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
1641 goto clear_hash;
1642 if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
1643 goto clear_hash;
1644 if (tcp_md5_hash_key(&hp, key))
1645 goto clear_hash;
1646 ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
1647 if (crypto_ahash_final(hp.req))
1648 goto clear_hash;
1649
1650 tcp_sigpool_end(&hp);
1651 return 0;
1652
1653 clear_hash:
1654 tcp_sigpool_end(&hp);
1655 clear_hash_nostart:
1656 memset(md5_hash, 0, 16);
1657 return 1;
1658 }
1659 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1660
1661 #endif
1662
1663 static void tcp_v4_init_req(struct request_sock *req,
1664 const struct sock *sk_listener,
1665 struct sk_buff *skb)
1666 {
1667 struct inet_request_sock *ireq = inet_rsk(req);
1668 struct net *net = sock_net(sk_listener);
1669
1670 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1671 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1672 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1673 }
1674
1675 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1676 struct sk_buff *skb,
1677 struct flowi *fl,
1678 struct request_sock *req,
1679 u32 tw_isn)
1680 {
1681 tcp_v4_init_req(req, sk, skb);
1682
1683 if (security_inet_conn_request(sk, skb, req))
1684 return NULL;
1685
1686 return inet_csk_route_req(sk, &fl->u.ip4, req);
1687 }
1688
1689 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1690 .family = PF_INET,
1691 .obj_size = sizeof(struct tcp_request_sock),
1692 .rtx_syn_ack = tcp_rtx_synack,
1693 .send_ack = tcp_v4_reqsk_send_ack,
1694 .destructor = tcp_v4_reqsk_destructor,
1695 .send_reset = tcp_v4_send_reset,
1696 .syn_ack_timeout = tcp_syn_ack_timeout,
1697 };
1698
1699 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1700 .mss_clamp = TCP_MSS_DEFAULT,
1701 #ifdef CONFIG_TCP_MD5SIG
1702 .req_md5_lookup = tcp_v4_md5_lookup,
1703 .calc_md5_hash = tcp_v4_md5_hash_skb,
1704 #endif
1705 #ifdef CONFIG_TCP_AO
1706 .ao_lookup = tcp_v4_ao_lookup_rsk,
1707 .ao_calc_key = tcp_v4_ao_calc_key_rsk,
1708 .ao_synack_hash = tcp_v4_ao_synack_hash,
1709 #endif
1710 #ifdef CONFIG_SYN_COOKIES
1711 .cookie_init_seq = cookie_v4_init_sequence,
1712 #endif
1713 .route_req = tcp_v4_route_req,
1714 .init_seq = tcp_v4_init_seq,
1715 .init_ts_off = tcp_v4_init_ts_off,
1716 .send_synack = tcp_v4_send_synack,
1717 };
1718
1719 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1720 {
1721 /* Never answer SYNs sent to broadcast or multicast */
1722 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1723 goto drop;
1724
1725 return tcp_conn_request(&tcp_request_sock_ops,
1726 &tcp_request_sock_ipv4_ops, sk, skb);
1727
1728 drop:
1729 tcp_listendrop(sk);
1730 return 0;
1731 }
1732 EXPORT_SYMBOL(tcp_v4_conn_request);
1733
1734
1735 /*
1736 * The three way handshake has completed - we got a valid synack -
1737 * now create the new socket.
1738 */
1739 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1740 struct request_sock *req,
1741 struct dst_entry *dst,
1742 struct request_sock *req_unhash,
1743 bool *own_req)
1744 {
1745 struct inet_request_sock *ireq;
1746 bool found_dup_sk = false;
1747 struct inet_sock *newinet;
1748 struct tcp_sock *newtp;
1749 struct sock *newsk;
1750 #ifdef CONFIG_TCP_MD5SIG
1751 const union tcp_md5_addr *addr;
1752 struct tcp_md5sig_key *key;
1753 int l3index;
1754 #endif
1755 struct ip_options_rcu *inet_opt;
1756
1757 if (sk_acceptq_is_full(sk))
1758 goto exit_overflow;
1759
1760 newsk = tcp_create_openreq_child(sk, req, skb);
1761 if (!newsk)
1762 goto exit_nonewsk;
1763
1764 newsk->sk_gso_type = SKB_GSO_TCPV4;
1765 inet_sk_rx_dst_set(newsk, skb);
1766
1767 newtp = tcp_sk(newsk);
1768 newinet = inet_sk(newsk);
1769 ireq = inet_rsk(req);
1770 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1771 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1772 newsk->sk_bound_dev_if = ireq->ir_iif;
1773 newinet->inet_saddr = ireq->ir_loc_addr;
1774 inet_opt = rcu_dereference(ireq->ireq_opt);
1775 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1776 newinet->mc_index = inet_iif(skb);
1777 newinet->mc_ttl = ip_hdr(skb)->ttl;
1778 newinet->rcv_tos = ip_hdr(skb)->tos;
1779 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1780 if (inet_opt)
1781 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1782 atomic_set(&newinet->inet_id, get_random_u16());
1783
1784 /* Set ToS of the new socket based upon the value of incoming SYN.
1785 * ECT bits are set later in tcp_init_transfer().
1786 */
1787 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1788 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1789
1790 if (!dst) {
1791 dst = inet_csk_route_child_sock(sk, newsk, req);
1792 if (!dst)
1793 goto put_and_exit;
1794 } else {
1795 /* syncookie case : see end of cookie_v4_check() */
1796 }
1797 sk_setup_caps(newsk, dst);
1798
1799 tcp_ca_openreq_child(newsk, dst);
1800
1801 tcp_sync_mss(newsk, dst_mtu(dst));
1802 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1803
1804 tcp_initialize_rcv_mss(newsk);
1805
1806 #ifdef CONFIG_TCP_MD5SIG
1807 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1808 /* Copy over the MD5 key from the original socket */
1809 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1810 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1811 if (key && !tcp_rsk_used_ao(req)) {
1812 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1813 goto put_and_exit;
1814 sk_gso_disable(newsk);
1815 }
1816 #endif
1817 #ifdef CONFIG_TCP_AO
1818 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1819 goto put_and_exit; /* OOM, release back memory */
1820 #endif
1821
1822 if (__inet_inherit_port(sk, newsk) < 0)
1823 goto put_and_exit;
1824 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1825 &found_dup_sk);
1826 if (likely(*own_req)) {
1827 tcp_move_syn(newtp, req);
1828 ireq->ireq_opt = NULL;
1829 } else {
1830 newinet->inet_opt = NULL;
1831
1832 if (!req_unhash && found_dup_sk) {
1833 /* This code path should only be executed in the
1834 * syncookie case
1835 */
1836 bh_unlock_sock(newsk);
1837 sock_put(newsk);
1838 newsk = NULL;
1839 }
1840 }
1841 return newsk;
1842
1843 exit_overflow:
1844 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1845 exit_nonewsk:
1846 dst_release(dst);
1847 exit:
1848 tcp_listendrop(sk);
1849 return NULL;
1850 put_and_exit:
1851 newinet->inet_opt = NULL;
1852 inet_csk_prepare_forced_close(newsk);
1853 tcp_done(newsk);
1854 goto exit;
1855 }
1856 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1857
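/* With syncookies, the ACK completing the handshake arrives on a listener
 * without a matching request sock. cookie_v4_check() validates the cookie
 * carried in that non-SYN segment and may return a newly created child
 * socket, the unchanged listener, or NULL to drop the segment.
 */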
1858 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1859 {
1860 #ifdef CONFIG_SYN_COOKIES
1861 const struct tcphdr *th = tcp_hdr(skb);
1862
1863 if (!th->syn)
1864 sk = cookie_v4_check(sk, skb);
1865 #endif
1866 return sk;
1867 }
1868
1869 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1870 struct tcphdr *th, u32 *cookie)
1871 {
1872 u16 mss = 0;
1873 #ifdef CONFIG_SYN_COOKIES
1874 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1875 &tcp_request_sock_ipv4_ops, sk, th);
1876 if (mss) {
1877 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1878 tcp_synq_overflow(sk);
1879 }
1880 #endif
1881 return mss;
1882 }
1883
1884 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1885 u32));
1886 /* The socket must have its spinlock held when we get
1887 * here, unless it is a TCP_LISTEN socket.
1888 *
1889 * We have a potential double-lock case here, so even when
1890 * doing backlog processing we use the BH locking scheme.
1891 * This is because we cannot sleep with the original spinlock
1892 * held.
1893 */
1894 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1895 {
1896 enum skb_drop_reason reason;
1897 struct sock *rsk;
1898
1899 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1900 struct dst_entry *dst;
1901
1902 dst = rcu_dereference_protected(sk->sk_rx_dst,
1903 lockdep_sock_is_held(sk));
1904
1905 sock_rps_save_rxhash(sk, skb);
1906 sk_mark_napi_id(sk, skb);
1907 if (dst) {
1908 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1909 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1910 dst, 0)) {
1911 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1912 dst_release(dst);
1913 }
1914 }
1915 tcp_rcv_established(sk, skb);
1916 return 0;
1917 }
1918
1919 if (tcp_checksum_complete(skb))
1920 goto csum_err;
1921
1922 if (sk->sk_state == TCP_LISTEN) {
1923 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1924
1925 if (!nsk)
1926 return 0;
1927 if (nsk != sk) {
1928 reason = tcp_child_process(sk, nsk, skb);
1929 if (reason) {
1930 rsk = nsk;
1931 goto reset;
1932 }
1933 return 0;
1934 }
1935 } else
1936 sock_rps_save_rxhash(sk, skb);
1937
1938 reason = tcp_rcv_state_process(sk, skb);
1939 if (reason) {
1940 rsk = sk;
1941 goto reset;
1942 }
1943 return 0;
1944
1945 reset:
1946 tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
1947 discard:
1948 sk_skb_reason_drop(sk, skb, reason);
1949 /* Be careful here. If this function gets more complicated and
1950 * gcc suffers from register pressure on the x86, sk (in %ebx)
1951 * might be destroyed here. This current version compiles correctly,
1952 * but you have been warned.
1953 */
1954 return 0;
1955
1956 csum_err:
1957 reason = SKB_DROP_REASON_TCP_CSUM;
1958 trace_tcp_bad_csum(skb);
1959 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1960 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1961 goto discard;
1962 }
1963 EXPORT_SYMBOL(tcp_v4_do_rcv);
1964
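/* Early demux, called from the IP receive path before routing: look up an
 * established socket by 4-tuple and attach it to the skb, reusing the
 * socket's cached input route when it is still valid for this interface.
 */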
1965 int tcp_v4_early_demux(struct sk_buff *skb)
1966 {
1967 struct net *net = dev_net(skb->dev);
1968 const struct iphdr *iph;
1969 const struct tcphdr *th;
1970 struct sock *sk;
1971
1972 if (skb->pkt_type != PACKET_HOST)
1973 return 0;
1974
1975 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1976 return 0;
1977
1978 iph = ip_hdr(skb);
1979 th = tcp_hdr(skb);
1980
1981 if (th->doff < sizeof(struct tcphdr) / 4)
1982 return 0;
1983
1984 sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1985 iph->saddr, th->source,
1986 iph->daddr, ntohs(th->dest),
1987 skb->skb_iif, inet_sdif(skb));
1988 if (sk) {
1989 skb->sk = sk;
1990 skb->destructor = sock_edemux;
1991 if (sk_fullsock(sk)) {
1992 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1993
1994 if (dst)
1995 dst = dst_check(dst, 0);
1996 if (dst &&
1997 sk->sk_rx_dst_ifindex == skb->skb_iif)
1998 skb_dst_set_noref(skb, dst);
1999 }
2000 }
2001 return 0;
2002 }
2003
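/* Append an incoming segment to the backlog of a socket currently owned by
 * user context. The segment is first coalesced into the backlog tail when
 * possible to limit memory usage. Returns true, with the socket already
 * unlocked, if the segment had to be dropped.
 */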
2004 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2005 enum skb_drop_reason *reason)
2006 {
2007 u32 tail_gso_size, tail_gso_segs;
2008 struct skb_shared_info *shinfo;
2009 const struct tcphdr *th;
2010 struct tcphdr *thtail;
2011 struct sk_buff *tail;
2012 unsigned int hdrlen;
2013 bool fragstolen;
2014 u32 gso_segs;
2015 u32 gso_size;
2016 u64 limit;
2017 int delta;
2018
2019 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2020 * we can fix skb->truesize to its real value to avoid future drops.
2021 * This is valid because skb is not yet charged to the socket.
2022 * Pure SACK packets have been seen being dropped when built by
2023 * drivers that lack the copybreak feature.
2024 */
2025 skb_condense(skb);
2026
2027 skb_dst_drop(skb);
2028
2029 if (unlikely(tcp_checksum_complete(skb))) {
2030 bh_unlock_sock(sk);
2031 trace_tcp_bad_csum(skb);
2032 *reason = SKB_DROP_REASON_TCP_CSUM;
2033 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2034 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2035 return true;
2036 }
2037
2038 /* Attempt coalescing to last skb in backlog, even if we are
2039 * above the limits.
2040 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2041 */
2042 th = (const struct tcphdr *)skb->data;
2043 hdrlen = th->doff * 4;
2044
2045 tail = sk->sk_backlog.tail;
2046 if (!tail)
2047 goto no_coalesce;
2048 thtail = (struct tcphdr *)tail->data;
2049
2050 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2051 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2052 ((TCP_SKB_CB(tail)->tcp_flags |
2053 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2054 !((TCP_SKB_CB(tail)->tcp_flags &
2055 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2056 ((TCP_SKB_CB(tail)->tcp_flags ^
2057 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
2058 !tcp_skb_can_collapse_rx(tail, skb) ||
2059 thtail->doff != th->doff ||
2060 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
2061 goto no_coalesce;
2062
2063 __skb_pull(skb, hdrlen);
2064
2065 shinfo = skb_shinfo(skb);
2066 gso_size = shinfo->gso_size ?: skb->len;
2067 gso_segs = shinfo->gso_segs ?: 1;
2068
2069 shinfo = skb_shinfo(tail);
2070 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2071 tail_gso_segs = shinfo->gso_segs ?: 1;
2072
2073 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
2074 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2075
2076 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2077 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2078 thtail->window = th->window;
2079 }
2080
2081 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2082 * thtail->fin, so that the fast path in tcp_rcv_established()
2083 * is not entered if we append a packet with a FIN.
2084 * SYN, RST, URG are not present.
2085 * ACK is set on both packets.
2086 * PSH : we do not really care in TCP stack,
2087 * at least for 'GRO' packets.
2088 */
2089 thtail->fin |= th->fin;
2090 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2091
2092 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2093 TCP_SKB_CB(tail)->has_rxtstamp = true;
2094 tail->tstamp = skb->tstamp;
2095 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2096 }
2097
2098 /* Not as strict as GRO. We only need to carry mss max value */
2099 shinfo->gso_size = max(gso_size, tail_gso_size);
2100 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2101
2102 sk->sk_backlog.len += delta;
2103 __NET_INC_STATS(sock_net(sk),
2104 LINUX_MIB_TCPBACKLOGCOALESCE);
2105 kfree_skb_partial(skb, fragstolen);
2106 return false;
2107 }
2108 __skb_push(skb, hdrlen);
2109
2110 no_coalesce:
2111 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2112 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2113 * sk_rcvbuf in normal conditions.
2114 */
2115 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2116
2117 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2118
2119 /* Only the socket owner can try to collapse/prune the rx queues
2120 * to reduce memory overhead, so add a little headroom here.
2121 * Only a few socket backlogs are likely to be non-empty at once.
2122 */
2123 limit += 64 * 1024;
2124
2125 limit = min_t(u64, limit, UINT_MAX);
2126
2127 if (unlikely(sk_add_backlog(sk, skb, limit))) {
2128 bh_unlock_sock(sk);
2129 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2130 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2131 return true;
2132 }
2133 return false;
2134 }
2135 EXPORT_SYMBOL(tcp_add_backlog);
2136
2137 int tcp_filter(struct sock *sk, struct sk_buff *skb)
2138 {
2139 struct tcphdr *th = (struct tcphdr *)skb->data;
2140
2141 return sk_filter_trim_cap(sk, skb, th->doff * 4);
2142 }
2143 EXPORT_SYMBOL(tcp_filter);
2144
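/* Undo tcp_v4_fill_cb(): move the IP control block back to its original
 * place in skb->cb so the skb can be reprocessed, e.g. handed to another
 * socket after a fresh lookup.
 */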
2145 static void tcp_v4_restore_cb(struct sk_buff *skb)
2146 {
2147 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2148 sizeof(struct inet_skb_parm));
2149 }
2150
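/* Populate TCP_SKB_CB() from the TCP header: sequence numbers, flags, the
 * IP DS field and rx timestamp presence. The IP control block is saved
 * first because TCP_SKB_CB() overlays the same skb->cb area.
 */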
2151 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2152 const struct tcphdr *th)
2153 {
2154 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
2155 * barrier() makes sure the compiler won't play aliasing games.
2156 */
2157 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2158 sizeof(struct inet_skb_parm));
2159 barrier();
2160
2161 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2162 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2163 skb->len - th->doff * 4);
2164 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2165 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
2166 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2167 TCP_SKB_CB(skb)->sacked = 0;
2168 TCP_SKB_CB(skb)->has_rxtstamp =
2169 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2170 }
2171
2172 /*
2173 * From tcp_input.c
2174 */
2175
2176 int tcp_v4_rcv(struct sk_buff *skb)
2177 {
2178 struct net *net = dev_net(skb->dev);
2179 enum skb_drop_reason drop_reason;
2180 int sdif = inet_sdif(skb);
2181 int dif = inet_iif(skb);
2182 const struct iphdr *iph;
2183 const struct tcphdr *th;
2184 struct sock *sk = NULL;
2185 bool refcounted;
2186 int ret;
2187 u32 isn;
2188
2189 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2190 if (skb->pkt_type != PACKET_HOST)
2191 goto discard_it;
2192
2193 /* Count it even if it's bad */
2194 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2195
2196 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2197 goto discard_it;
2198
2199 th = (const struct tcphdr *)skb->data;
2200
2201 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2202 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2203 goto bad_packet;
2204 }
2205 if (!pskb_may_pull(skb, th->doff * 4))
2206 goto discard_it;
2207
2208 /* An explanation is required here, I think.
2209 * Packet length and doff are validated by header prediction,
2210 * provided the case of th->doff==0 is eliminated.
2211 * So, we defer the checks. */
2212
2213 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2214 goto csum_error;
2215
2216 th = (const struct tcphdr *)skb->data;
2217 iph = ip_hdr(skb);
2218 lookup:
2219 sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2220 skb, __tcp_hdrlen(th), th->source,
2221 th->dest, sdif, &refcounted);
2222 if (!sk)
2223 goto no_tcp_socket;
2224
2225 if (sk->sk_state == TCP_TIME_WAIT)
2226 goto do_time_wait;
2227
2228 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2229 struct request_sock *req = inet_reqsk(sk);
2230 bool req_stolen = false;
2231 struct sock *nsk;
2232
2233 sk = req->rsk_listener;
2234 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2235 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2236 else
2237 drop_reason = tcp_inbound_hash(sk, req, skb,
2238 &iph->saddr, &iph->daddr,
2239 AF_INET, dif, sdif);
2240 if (unlikely(drop_reason)) {
2241 sk_drops_add(sk, skb);
2242 reqsk_put(req);
2243 goto discard_it;
2244 }
2245 if (tcp_checksum_complete(skb)) {
2246 reqsk_put(req);
2247 goto csum_error;
2248 }
2249 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2250 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2251 if (!nsk) {
2252 inet_csk_reqsk_queue_drop_and_put(sk, req);
2253 goto lookup;
2254 }
2255 sk = nsk;
2256 /* reuseport_migrate_sock() has already held one sk_refcnt
2257 * before returning.
2258 */
2259 } else {
2260 /* We own a reference on the listener, increase it again
2261 * as we might lose it too soon.
2262 */
2263 sock_hold(sk);
2264 }
2265 refcounted = true;
2266 nsk = NULL;
2267 if (!tcp_filter(sk, skb)) {
2268 th = (const struct tcphdr *)skb->data;
2269 iph = ip_hdr(skb);
2270 tcp_v4_fill_cb(skb, iph, th);
2271 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2272 } else {
2273 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2274 }
2275 if (!nsk) {
2276 reqsk_put(req);
2277 if (req_stolen) {
2278 /* Another cpu got exclusive access to req
2279 * and created a full-blown socket.
2280 * Try to feed this packet to that socket
2281 * instead of discarding it.
2282 */
2283 tcp_v4_restore_cb(skb);
2284 sock_put(sk);
2285 goto lookup;
2286 }
2287 goto discard_and_relse;
2288 }
2289 nf_reset_ct(skb);
2290 if (nsk == sk) {
2291 reqsk_put(req);
2292 tcp_v4_restore_cb(skb);
2293 } else {
2294 drop_reason = tcp_child_process(sk, nsk, skb);
2295 if (drop_reason) {
2296 enum sk_rst_reason rst_reason;
2297
2298 rst_reason = sk_rst_convert_drop_reason(drop_reason);
2299 tcp_v4_send_reset(nsk, skb, rst_reason);
2300 goto discard_and_relse;
2301 }
2302 sock_put(sk);
2303 return 0;
2304 }
2305 }
2306
2307 process:
2308 if (static_branch_unlikely(&ip4_min_ttl)) {
2309 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2310 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2311 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2312 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2313 goto discard_and_relse;
2314 }
2315 }
2316
2317 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2318 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2319 goto discard_and_relse;
2320 }
2321
2322 drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2323 AF_INET, dif, sdif);
2324 if (drop_reason)
2325 goto discard_and_relse;
2326
2327 nf_reset_ct(skb);
2328
2329 if (tcp_filter(sk, skb)) {
2330 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2331 goto discard_and_relse;
2332 }
2333 th = (const struct tcphdr *)skb->data;
2334 iph = ip_hdr(skb);
2335 tcp_v4_fill_cb(skb, iph, th);
2336
2337 skb->dev = NULL;
2338
2339 if (sk->sk_state == TCP_LISTEN) {
2340 ret = tcp_v4_do_rcv(sk, skb);
2341 goto put_and_return;
2342 }
2343
2344 sk_incoming_cpu_update(sk);
2345
2346 bh_lock_sock_nested(sk);
2347 tcp_segs_in(tcp_sk(sk), skb);
2348 ret = 0;
2349 if (!sock_owned_by_user(sk)) {
2350 ret = tcp_v4_do_rcv(sk, skb);
2351 } else {
2352 if (tcp_add_backlog(sk, skb, &drop_reason))
2353 goto discard_and_relse;
2354 }
2355 bh_unlock_sock(sk);
2356
2357 put_and_return:
2358 if (refcounted)
2359 sock_put(sk);
2360
2361 return ret;
2362
2363 no_tcp_socket:
2364 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2365 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2366 goto discard_it;
2367
2368 tcp_v4_fill_cb(skb, iph, th);
2369
2370 if (tcp_checksum_complete(skb)) {
2371 csum_error:
2372 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2373 trace_tcp_bad_csum(skb);
2374 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2375 bad_packet:
2376 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2377 } else {
2378 tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2379 }
2380
2381 discard_it:
2382 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2383 /* Discard frame. */
2384 sk_skb_reason_drop(sk, skb, drop_reason);
2385 return 0;
2386
2387 discard_and_relse:
2388 sk_drops_add(sk, skb);
2389 if (refcounted)
2390 sock_put(sk);
2391 goto discard_it;
2392
2393 do_time_wait:
2394 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2395 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2396 inet_twsk_put(inet_twsk(sk));
2397 goto discard_it;
2398 }
2399
2400 tcp_v4_fill_cb(skb, iph, th);
2401
2402 if (tcp_checksum_complete(skb)) {
2403 inet_twsk_put(inet_twsk(sk));
2404 goto csum_error;
2405 }
2406 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
2407 case TCP_TW_SYN: {
2408 struct sock *sk2 = inet_lookup_listener(net,
2409 net->ipv4.tcp_death_row.hashinfo,
2410 skb, __tcp_hdrlen(th),
2411 iph->saddr, th->source,
2412 iph->daddr, th->dest,
2413 inet_iif(skb),
2414 sdif);
2415 if (sk2) {
2416 inet_twsk_deschedule_put(inet_twsk(sk));
2417 sk = sk2;
2418 tcp_v4_restore_cb(skb);
2419 refcounted = false;
2420 __this_cpu_write(tcp_tw_isn, isn);
2421 goto process;
2422 }
2423 }
2424 /* to ACK */
2425 fallthrough;
2426 case TCP_TW_ACK:
2427 tcp_v4_timewait_ack(sk, skb);
2428 break;
2429 case TCP_TW_RST:
2430 tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2431 inet_twsk_deschedule_put(inet_twsk(sk));
2432 goto discard_it;
2433 case TCP_TW_SUCCESS:;
2434 }
2435 goto discard_it;
2436 }
2437
2438 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2439 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2440 .twsk_destructor= tcp_twsk_destructor,
2441 };
2442
2443 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2444 {
2445 struct dst_entry *dst = skb_dst(skb);
2446
2447 if (dst && dst_hold_safe(dst)) {
2448 rcu_assign_pointer(sk->sk_rx_dst, dst);
2449 sk->sk_rx_dst_ifindex = skb->skb_iif;
2450 }
2451 }
2452 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2453
2454 const struct inet_connection_sock_af_ops ipv4_specific = {
2455 .queue_xmit = ip_queue_xmit,
2456 .send_check = tcp_v4_send_check,
2457 .rebuild_header = inet_sk_rebuild_header,
2458 .sk_rx_dst_set = inet_sk_rx_dst_set,
2459 .conn_request = tcp_v4_conn_request,
2460 .syn_recv_sock = tcp_v4_syn_recv_sock,
2461 .net_header_len = sizeof(struct iphdr),
2462 .setsockopt = ip_setsockopt,
2463 .getsockopt = ip_getsockopt,
2464 .addr2sockaddr = inet_csk_addr2sockaddr,
2465 .sockaddr_len = sizeof(struct sockaddr_in),
2466 .mtu_reduced = tcp_v4_mtu_reduced,
2467 };
2468 EXPORT_SYMBOL(ipv4_specific);
2469
2470 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2471 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2472 #ifdef CONFIG_TCP_MD5SIG
2473 .md5_lookup = tcp_v4_md5_lookup,
2474 .calc_md5_hash = tcp_v4_md5_hash_skb,
2475 .md5_parse = tcp_v4_parse_md5_keys,
2476 #endif
2477 #ifdef CONFIG_TCP_AO
2478 .ao_lookup = tcp_v4_ao_lookup,
2479 .calc_ao_hash = tcp_v4_ao_hash_skb,
2480 .ao_parse = tcp_v4_parse_ao,
2481 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2482 #endif
2483 };
2484 #endif
2485
2486 /* NOTE: A lot of things are set to zero explicitly by the call to
2487 * sk_alloc(), so they need not be done here.
2488 */
2489 static int tcp_v4_init_sock(struct sock *sk)
2490 {
2491 struct inet_connection_sock *icsk = inet_csk(sk);
2492
2493 tcp_init_sock(sk);
2494
2495 icsk->icsk_af_ops = &ipv4_specific;
2496
2497 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2498 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2499 #endif
2500
2501 return 0;
2502 }
2503
2504 #ifdef CONFIG_TCP_MD5SIG
2505 static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2506 {
2507 struct tcp_md5sig_info *md5sig;
2508
2509 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2510 kfree(md5sig);
2511 static_branch_slow_dec_deferred(&tcp_md5_needed);
2512 tcp_md5_release_sigpool();
2513 }
2514 #endif
2515
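/* Drop the netmem references still tracked in sk->sk_user_frags (devmem
 * TCP receive); each entry holds a page_pool reference released through
 * napi_pp_put_page().
 */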
2516 static void tcp_release_user_frags(struct sock *sk)
2517 {
2518 #ifdef CONFIG_PAGE_POOL
2519 unsigned long index;
2520 void *netmem;
2521
2522 xa_for_each(&sk->sk_user_frags, index, netmem)
2523 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2524 #endif
2525 }
2526
2527 void tcp_v4_destroy_sock(struct sock *sk)
2528 {
2529 struct tcp_sock *tp = tcp_sk(sk);
2530
2531 tcp_release_user_frags(sk);
2532
2533 xa_destroy(&sk->sk_user_frags);
2534
2535 trace_tcp_destroy_sock(sk);
2536
2537 tcp_clear_xmit_timers(sk);
2538
2539 tcp_cleanup_congestion_control(sk);
2540
2541 tcp_cleanup_ulp(sk);
2542
2543 /* Clean up the write buffer. */
2544 tcp_write_queue_purge(sk);
2545
2546 /* Check if we want to disable active TFO */
2547 tcp_fastopen_active_disable_ofo_check(sk);
2548
2549 /* Cleans up our, hopefully empty, out_of_order_queue. */
2550 skb_rbtree_purge(&tp->out_of_order_queue);
2551
2552 #ifdef CONFIG_TCP_MD5SIG
2553 /* Clean up the MD5 key list, if any */
2554 if (tp->md5sig_info) {
2555 struct tcp_md5sig_info *md5sig;
2556
2557 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2558 tcp_clear_md5_list(sk);
2559 call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
2560 rcu_assign_pointer(tp->md5sig_info, NULL);
2561 }
2562 #endif
2563 tcp_ao_destroy_sock(sk, false);
2564
2565 /* Clean up a referenced TCP bind bucket. */
2566 if (inet_csk(sk)->icsk_bind_hash)
2567 inet_put_port(sk);
2568
2569 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2570
2571 /* If socket is aborted during connect operation */
2572 tcp_free_fastopen_req(tp);
2573 tcp_fastopen_destroy_cipher(sk);
2574 tcp_saved_syn_free(tp);
2575
2576 sk_sockets_allocated_dec(sk);
2577 }
2578 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2579
2580 #ifdef CONFIG_PROC_FS
2581 /* Proc filesystem TCP sock list dumping. */
2582
2583 static unsigned short seq_file_family(const struct seq_file *seq);
2584
2585 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2586 {
2587 unsigned short family = seq_file_family(seq);
2588
2589 /* AF_UNSPEC is used as a match all */
2590 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2591 net_eq(sock_net(sk), seq_file_net(seq)));
2592 }
2593
2594 /* Find a non-empty bucket (starting from st->bucket)
2595 * and return the first sk from it.
2596 */
2597 static void *listening_get_first(struct seq_file *seq)
2598 {
2599 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2600 struct tcp_iter_state *st = seq->private;
2601
2602 st->offset = 0;
2603 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2604 struct inet_listen_hashbucket *ilb2;
2605 struct hlist_nulls_node *node;
2606 struct sock *sk;
2607
2608 ilb2 = &hinfo->lhash2[st->bucket];
2609 if (hlist_nulls_empty(&ilb2->nulls_head))
2610 continue;
2611
2612 spin_lock(&ilb2->lock);
2613 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2614 if (seq_sk_match(seq, sk))
2615 return sk;
2616 }
2617 spin_unlock(&ilb2->lock);
2618 }
2619
2620 return NULL;
2621 }
2622
2623 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2624 * If "cur" is the last one in the st->bucket,
2625 * call listening_get_first() to return the first sk of the next
2626 * non-empty bucket.
2627 */
2628 static void *listening_get_next(struct seq_file *seq, void *cur)
2629 {
2630 struct tcp_iter_state *st = seq->private;
2631 struct inet_listen_hashbucket *ilb2;
2632 struct hlist_nulls_node *node;
2633 struct inet_hashinfo *hinfo;
2634 struct sock *sk = cur;
2635
2636 ++st->num;
2637 ++st->offset;
2638
2639 sk = sk_nulls_next(sk);
2640 sk_nulls_for_each_from(sk, node) {
2641 if (seq_sk_match(seq, sk))
2642 return sk;
2643 }
2644
2645 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2646 ilb2 = &hinfo->lhash2[st->bucket];
2647 spin_unlock(&ilb2->lock);
2648 ++st->bucket;
2649 return listening_get_first(seq);
2650 }
2651
2652 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2653 {
2654 struct tcp_iter_state *st = seq->private;
2655 void *rc;
2656
2657 st->bucket = 0;
2658 st->offset = 0;
2659 rc = listening_get_first(seq);
2660
2661 while (rc && *pos) {
2662 rc = listening_get_next(seq, rc);
2663 --*pos;
2664 }
2665 return rc;
2666 }
2667
2668 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2669 const struct tcp_iter_state *st)
2670 {
2671 return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2672 }
2673
2674 /*
2675 * Get first established socket starting from bucket given in st->bucket.
2676 * If st->bucket is zero, the very first socket in the hash is returned.
2677 */
2678 static void *established_get_first(struct seq_file *seq)
2679 {
2680 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2681 struct tcp_iter_state *st = seq->private;
2682
2683 st->offset = 0;
2684 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2685 struct sock *sk;
2686 struct hlist_nulls_node *node;
2687 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2688
2689 cond_resched();
2690
2691 /* Lockless fast path for the common case of empty buckets */
2692 if (empty_bucket(hinfo, st))
2693 continue;
2694
2695 spin_lock_bh(lock);
2696 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2697 if (seq_sk_match(seq, sk))
2698 return sk;
2699 }
2700 spin_unlock_bh(lock);
2701 }
2702
2703 return NULL;
2704 }
2705
2706 static void *established_get_next(struct seq_file *seq, void *cur)
2707 {
2708 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2709 struct tcp_iter_state *st = seq->private;
2710 struct hlist_nulls_node *node;
2711 struct sock *sk = cur;
2712
2713 ++st->num;
2714 ++st->offset;
2715
2716 sk = sk_nulls_next(sk);
2717
2718 sk_nulls_for_each_from(sk, node) {
2719 if (seq_sk_match(seq, sk))
2720 return sk;
2721 }
2722
2723 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2724 ++st->bucket;
2725 return established_get_first(seq);
2726 }
2727
2728 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2729 {
2730 struct tcp_iter_state *st = seq->private;
2731 void *rc;
2732
2733 st->bucket = 0;
2734 rc = established_get_first(seq);
2735
2736 while (rc && pos) {
2737 rc = established_get_next(seq, rc);
2738 --pos;
2739 }
2740 return rc;
2741 }
2742
2743 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2744 {
2745 void *rc;
2746 struct tcp_iter_state *st = seq->private;
2747
2748 st->state = TCP_SEQ_STATE_LISTENING;
2749 rc = listening_get_idx(seq, &pos);
2750
2751 if (!rc) {
2752 st->state = TCP_SEQ_STATE_ESTABLISHED;
2753 rc = established_get_idx(seq, pos);
2754 }
2755
2756 return rc;
2757 }
2758
2759 static void *tcp_seek_last_pos(struct seq_file *seq)
2760 {
2761 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2762 struct tcp_iter_state *st = seq->private;
2763 int bucket = st->bucket;
2764 int offset = st->offset;
2765 int orig_num = st->num;
2766 void *rc = NULL;
2767
2768 switch (st->state) {
2769 case TCP_SEQ_STATE_LISTENING:
2770 if (st->bucket > hinfo->lhash2_mask)
2771 break;
2772 rc = listening_get_first(seq);
2773 while (offset-- && rc && bucket == st->bucket)
2774 rc = listening_get_next(seq, rc);
2775 if (rc)
2776 break;
2777 st->bucket = 0;
2778 st->state = TCP_SEQ_STATE_ESTABLISHED;
2779 fallthrough;
2780 case TCP_SEQ_STATE_ESTABLISHED:
2781 if (st->bucket > hinfo->ehash_mask)
2782 break;
2783 rc = established_get_first(seq);
2784 while (offset-- && rc && bucket == st->bucket)
2785 rc = established_get_next(seq, rc);
2786 }
2787
2788 st->num = orig_num;
2789
2790 return rc;
2791 }
2792
2793 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2794 {
2795 struct tcp_iter_state *st = seq->private;
2796 void *rc;
2797
2798 if (*pos && *pos == st->last_pos) {
2799 rc = tcp_seek_last_pos(seq);
2800 if (rc)
2801 goto out;
2802 }
2803
2804 st->state = TCP_SEQ_STATE_LISTENING;
2805 st->num = 0;
2806 st->bucket = 0;
2807 st->offset = 0;
2808 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2809
2810 out:
2811 st->last_pos = *pos;
2812 return rc;
2813 }
2814 EXPORT_SYMBOL(tcp_seq_start);
2815
2816 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2817 {
2818 struct tcp_iter_state *st = seq->private;
2819 void *rc = NULL;
2820
2821 if (v == SEQ_START_TOKEN) {
2822 rc = tcp_get_idx(seq, 0);
2823 goto out;
2824 }
2825
2826 switch (st->state) {
2827 case TCP_SEQ_STATE_LISTENING:
2828 rc = listening_get_next(seq, v);
2829 if (!rc) {
2830 st->state = TCP_SEQ_STATE_ESTABLISHED;
2831 st->bucket = 0;
2832 st->offset = 0;
2833 rc = established_get_first(seq);
2834 }
2835 break;
2836 case TCP_SEQ_STATE_ESTABLISHED:
2837 rc = established_get_next(seq, v);
2838 break;
2839 }
2840 out:
2841 ++*pos;
2842 st->last_pos = *pos;
2843 return rc;
2844 }
2845 EXPORT_SYMBOL(tcp_seq_next);
2846
2847 void tcp_seq_stop(struct seq_file *seq, void *v)
2848 {
2849 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2850 struct tcp_iter_state *st = seq->private;
2851
2852 switch (st->state) {
2853 case TCP_SEQ_STATE_LISTENING:
2854 if (v != SEQ_START_TOKEN)
2855 spin_unlock(&hinfo->lhash2[st->bucket].lock);
2856 break;
2857 case TCP_SEQ_STATE_ESTABLISHED:
2858 if (v)
2859 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2860 break;
2861 }
2862 }
2863 EXPORT_SYMBOL(tcp_seq_stop);
2864
2865 static void get_openreq4(const struct request_sock *req,
2866 struct seq_file *f, int i)
2867 {
2868 const struct inet_request_sock *ireq = inet_rsk(req);
2869 long delta = req->rsk_timer.expires - jiffies;
2870
2871 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2872 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2873 i,
2874 ireq->ir_loc_addr,
2875 ireq->ir_num,
2876 ireq->ir_rmt_addr,
2877 ntohs(ireq->ir_rmt_port),
2878 TCP_SYN_RECV,
2879 0, 0, /* could print option size, but that is af dependent. */
2880 1, /* timers active (only the expire timer) */
2881 jiffies_delta_to_clock_t(delta),
2882 req->num_timeout,
2883 from_kuid_munged(seq_user_ns(f),
2884 sock_i_uid(req->rsk_listener)),
2885 0, /* non standard timer */
2886 0, /* open_requests have no inode */
2887 0,
2888 req);
2889 }
2890
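/* Format one full socket (listening or established) as a /proc/net/tcp
 * line: addresses, state, queue sizes, the pending timer and a few
 * congestion-control related fields.
 */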
2891 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2892 {
2893 int timer_active;
2894 unsigned long timer_expires;
2895 const struct tcp_sock *tp = tcp_sk(sk);
2896 const struct inet_connection_sock *icsk = inet_csk(sk);
2897 const struct inet_sock *inet = inet_sk(sk);
2898 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2899 __be32 dest = inet->inet_daddr;
2900 __be32 src = inet->inet_rcv_saddr;
2901 __u16 destp = ntohs(inet->inet_dport);
2902 __u16 srcp = ntohs(inet->inet_sport);
2903 int rx_queue;
2904 int state;
2905
2906 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2907 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2908 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2909 timer_active = 1;
2910 timer_expires = icsk->icsk_timeout;
2911 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2912 timer_active = 4;
2913 timer_expires = icsk->icsk_timeout;
2914 } else if (timer_pending(&sk->sk_timer)) {
2915 timer_active = 2;
2916 timer_expires = sk->sk_timer.expires;
2917 } else {
2918 timer_active = 0;
2919 timer_expires = jiffies;
2920 }
2921
2922 state = inet_sk_state_load(sk);
2923 if (state == TCP_LISTEN)
2924 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2925 else
2926 /* Because we don't lock the socket,
2927 * we might find a transient negative value.
2928 */
2929 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2930 READ_ONCE(tp->copied_seq), 0);
2931
2932 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2933 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2934 i, src, srcp, dest, destp, state,
2935 READ_ONCE(tp->write_seq) - tp->snd_una,
2936 rx_queue,
2937 timer_active,
2938 jiffies_delta_to_clock_t(timer_expires - jiffies),
2939 icsk->icsk_retransmits,
2940 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2941 icsk->icsk_probes_out,
2942 sock_i_ino(sk),
2943 refcount_read(&sk->sk_refcnt), sk,
2944 jiffies_to_clock_t(icsk->icsk_rto),
2945 jiffies_to_clock_t(icsk->icsk_ack.ato),
2946 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2947 tcp_snd_cwnd(tp),
2948 state == TCP_LISTEN ?
2949 fastopenq->max_qlen :
2950 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2951 }
2952
2953 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2954 struct seq_file *f, int i)
2955 {
2956 long delta = tw->tw_timer.expires - jiffies;
2957 __be32 dest, src;
2958 __u16 destp, srcp;
2959
2960 dest = tw->tw_daddr;
2961 src = tw->tw_rcv_saddr;
2962 destp = ntohs(tw->tw_dport);
2963 srcp = ntohs(tw->tw_sport);
2964
2965 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2966 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2967 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2968 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2969 refcount_read(&tw->tw_refcnt), tw);
2970 }
2971
2972 #define TMPSZ 150
2973
2974 static int tcp4_seq_show(struct seq_file *seq, void *v)
2975 {
2976 struct tcp_iter_state *st;
2977 struct sock *sk = v;
2978
2979 seq_setwidth(seq, TMPSZ - 1);
2980 if (v == SEQ_START_TOKEN) {
2981 seq_puts(seq, " sl local_address rem_address st tx_queue "
2982 "rx_queue tr tm->when retrnsmt uid timeout "
2983 "inode");
2984 goto out;
2985 }
2986 st = seq->private;
2987
2988 if (sk->sk_state == TCP_TIME_WAIT)
2989 get_timewait4_sock(v, seq, st->num);
2990 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2991 get_openreq4(v, seq, st->num);
2992 else
2993 get_tcp4_sock(v, seq, st->num);
2994 out:
2995 seq_pad(seq, '\n');
2996 return 0;
2997 }
2998
2999 #ifdef CONFIG_BPF_SYSCALL
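/* bpf tcp iterator state: sockets of the current bucket are collected into
 * @batch (one reference held on each) so the bucket lock can be released
 * before bpf programs run. @cur_sk/@end_sk track progress through the
 * batch; @st_bucket_done marks a completely batched bucket.
 */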
3000 struct bpf_tcp_iter_state {
3001 struct tcp_iter_state state;
3002 unsigned int cur_sk;
3003 unsigned int end_sk;
3004 unsigned int max_sk;
3005 struct sock **batch;
3006 bool st_bucket_done;
3007 };
3008
3009 struct bpf_iter__tcp {
3010 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3011 __bpf_md_ptr(struct sock_common *, sk_common);
3012 uid_t uid __aligned(8);
3013 };
3014
3015 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3016 struct sock_common *sk_common, uid_t uid)
3017 {
3018 struct bpf_iter__tcp ctx;
3019
3020 meta->seq_num--; /* skip SEQ_START_TOKEN */
3021 ctx.meta = meta;
3022 ctx.sk_common = sk_common;
3023 ctx.uid = uid;
3024 return bpf_iter_run_prog(prog, &ctx);
3025 }
3026
3027 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3028 {
3029 while (iter->cur_sk < iter->end_sk)
3030 sock_gen_put(iter->batch[iter->cur_sk++]);
3031 }
3032
3033 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3034 unsigned int new_batch_sz)
3035 {
3036 struct sock **new_batch;
3037
3038 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3039 GFP_USER | __GFP_NOWARN);
3040 if (!new_batch)
3041 return -ENOMEM;
3042
3043 bpf_iter_tcp_put_batch(iter);
3044 kvfree(iter->batch);
3045 iter->batch = new_batch;
3046 iter->max_sk = new_batch_sz;
3047
3048 return 0;
3049 }
3050
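/* Walk the remainder of the current listening bucket, grabbing a reference
 * on each matching socket up to iter->max_sk. Returns how many matching
 * sockets the bucket actually holds so the caller can detect a batch that
 * was too small and resize it.
 */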
3051 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3052 struct sock *start_sk)
3053 {
3054 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3055 struct bpf_tcp_iter_state *iter = seq->private;
3056 struct tcp_iter_state *st = &iter->state;
3057 struct hlist_nulls_node *node;
3058 unsigned int expected = 1;
3059 struct sock *sk;
3060
3061 sock_hold(start_sk);
3062 iter->batch[iter->end_sk++] = start_sk;
3063
3064 sk = sk_nulls_next(start_sk);
3065 sk_nulls_for_each_from(sk, node) {
3066 if (seq_sk_match(seq, sk)) {
3067 if (iter->end_sk < iter->max_sk) {
3068 sock_hold(sk);
3069 iter->batch[iter->end_sk++] = sk;
3070 }
3071 expected++;
3072 }
3073 }
3074 spin_unlock(&hinfo->lhash2[st->bucket].lock);
3075
3076 return expected;
3077 }
3078
3079 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3080 struct sock *start_sk)
3081 {
3082 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3083 struct bpf_tcp_iter_state *iter = seq->private;
3084 struct tcp_iter_state *st = &iter->state;
3085 struct hlist_nulls_node *node;
3086 unsigned int expected = 1;
3087 struct sock *sk;
3088
3089 sock_hold(start_sk);
3090 iter->batch[iter->end_sk++] = start_sk;
3091
3092 sk = sk_nulls_next(start_sk);
3093 sk_nulls_for_each_from(sk, node) {
3094 if (seq_sk_match(seq, sk)) {
3095 if (iter->end_sk < iter->max_sk) {
3096 sock_hold(sk);
3097 iter->batch[iter->end_sk++] = sk;
3098 }
3099 expected++;
3100 }
3101 }
3102 spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3103
3104 return expected;
3105 }
3106
3107 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3108 {
3109 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3110 struct bpf_tcp_iter_state *iter = seq->private;
3111 struct tcp_iter_state *st = &iter->state;
3112 unsigned int expected;
3113 bool resized = false;
3114 struct sock *sk;
3115
3116 /* The st->bucket is done. Directly advance to the next
3117 * bucket instead of having tcp_seek_last_pos() skip entries
3118 * one by one in the current bucket, only to find out
3119 * it has to advance to the next bucket.
3120 */
3121 if (iter->st_bucket_done) {
3122 st->offset = 0;
3123 st->bucket++;
3124 if (st->state == TCP_SEQ_STATE_LISTENING &&
3125 st->bucket > hinfo->lhash2_mask) {
3126 st->state = TCP_SEQ_STATE_ESTABLISHED;
3127 st->bucket = 0;
3128 }
3129 }
3130
3131 again:
3132 /* Get a new batch */
3133 iter->cur_sk = 0;
3134 iter->end_sk = 0;
3135 iter->st_bucket_done = false;
3136
3137 sk = tcp_seek_last_pos(seq);
3138 if (!sk)
3139 return NULL; /* Done */
3140
3141 if (st->state == TCP_SEQ_STATE_LISTENING)
3142 expected = bpf_iter_tcp_listening_batch(seq, sk);
3143 else
3144 expected = bpf_iter_tcp_established_batch(seq, sk);
3145
3146 if (iter->end_sk == expected) {
3147 iter->st_bucket_done = true;
3148 return sk;
3149 }
3150
3151 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
3152 resized = true;
3153 goto again;
3154 }
3155
3156 return sk;
3157 }
3158
3159 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3160 {
3161 /* bpf iter does not support lseek, so it always
3162 * continues from where it was stop()-ped.
3163 */
3164 if (*pos)
3165 return bpf_iter_tcp_batch(seq);
3166
3167 return SEQ_START_TOKEN;
3168 }
3169
3170 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3171 {
3172 struct bpf_tcp_iter_state *iter = seq->private;
3173 struct tcp_iter_state *st = &iter->state;
3174 struct sock *sk;
3175
3176 /* Whenever seq_next() is called, the iter->cur_sk is
3177 * done with seq_show(), so advance to the next sk in
3178 * the batch.
3179 */
3180 if (iter->cur_sk < iter->end_sk) {
3181 /* Keeping st->num consistent in tcp_iter_state.
3182 * bpf_iter_tcp does not use st->num.
3183 * meta.seq_num is used instead.
3184 */
3185 st->num++;
3186 /* Move st->offset to the next sk in the bucket such that
3187 * the future start() will resume at st->offset in
3188 * st->bucket. See tcp_seek_last_pos().
3189 */
3190 st->offset++;
3191 sock_gen_put(iter->batch[iter->cur_sk++]);
3192 }
3193
3194 if (iter->cur_sk < iter->end_sk)
3195 sk = iter->batch[iter->cur_sk];
3196 else
3197 sk = bpf_iter_tcp_batch(seq);
3198
3199 ++*pos;
3200 /* Keeping st->last_pos consistent in tcp_iter_state.
3201 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3202 */
3203 st->last_pos = *pos;
3204 return sk;
3205 }
3206
3207 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3208 {
3209 struct bpf_iter_meta meta;
3210 struct bpf_prog *prog;
3211 struct sock *sk = v;
3212 uid_t uid;
3213 int ret;
3214
3215 if (v == SEQ_START_TOKEN)
3216 return 0;
3217
3218 if (sk_fullsock(sk))
3219 lock_sock(sk);
3220
3221 if (unlikely(sk_unhashed(sk))) {
3222 ret = SEQ_SKIP;
3223 goto unlock;
3224 }
3225
3226 if (sk->sk_state == TCP_TIME_WAIT) {
3227 uid = 0;
3228 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3229 const struct request_sock *req = v;
3230
3231 uid = from_kuid_munged(seq_user_ns(seq),
3232 sock_i_uid(req->rsk_listener));
3233 } else {
3234 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3235 }
3236
3237 meta.seq = seq;
3238 prog = bpf_iter_get_info(&meta, false);
3239 ret = tcp_prog_seq_show(prog, &meta, v, uid);
3240
3241 unlock:
3242 if (sk_fullsock(sk))
3243 release_sock(sk);
3244 return ret;
3245
3246 }
3247
3248 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3249 {
3250 struct bpf_tcp_iter_state *iter = seq->private;
3251 struct bpf_iter_meta meta;
3252 struct bpf_prog *prog;
3253
3254 if (!v) {
3255 meta.seq = seq;
3256 prog = bpf_iter_get_info(&meta, true);
3257 if (prog)
3258 (void)tcp_prog_seq_show(prog, &meta, v, 0);
3259 }
3260
3261 if (iter->cur_sk < iter->end_sk) {
3262 bpf_iter_tcp_put_batch(iter);
3263 iter->st_bucket_done = false;
3264 }
3265 }
3266
3267 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3268 .show = bpf_iter_tcp_seq_show,
3269 .start = bpf_iter_tcp_seq_start,
3270 .next = bpf_iter_tcp_seq_next,
3271 .stop = bpf_iter_tcp_seq_stop,
3272 };
3273 #endif
3274 static unsigned short seq_file_family(const struct seq_file *seq)
3275 {
3276 const struct tcp_seq_afinfo *afinfo;
3277
3278 #ifdef CONFIG_BPF_SYSCALL
3279 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3280 if (seq->op == &bpf_iter_tcp_seq_ops)
3281 return AF_UNSPEC;
3282 #endif
3283
3284 /* Iterated from proc fs */
3285 afinfo = pde_data(file_inode(seq->file));
3286 return afinfo->family;
3287 }
3288
3289 static const struct seq_operations tcp4_seq_ops = {
3290 .show = tcp4_seq_show,
3291 .start = tcp_seq_start,
3292 .next = tcp_seq_next,
3293 .stop = tcp_seq_stop,
3294 };
3295
3296 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3297 .family = AF_INET,
3298 };
3299
3300 static int __net_init tcp4_proc_init_net(struct net *net)
3301 {
3302 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3303 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3304 return -ENOMEM;
3305 return 0;
3306 }
3307
3308 static void __net_exit tcp4_proc_exit_net(struct net *net)
3309 {
3310 remove_proc_entry("tcp", net->proc_net);
3311 }
3312
3313 static struct pernet_operations tcp4_net_ops = {
3314 .init = tcp4_proc_init_net,
3315 .exit = tcp4_proc_exit_net,
3316 };
3317
3318 int __init tcp4_proc_init(void)
3319 {
3320 return register_pernet_subsys(&tcp4_net_ops);
3321 }
3322
3323 void tcp4_proc_exit(void)
3324 {
3325 unregister_pernet_subsys(&tcp4_net_ops);
3326 }
3327 #endif /* CONFIG_PROC_FS */
3328
3329 /* @wake is one when sk_stream_write_space() calls us.
3330 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3331 * This mimics the strategy used in sock_def_write_space().
3332 */
3333 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3334 {
3335 const struct tcp_sock *tp = tcp_sk(sk);
3336 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3337 READ_ONCE(tp->snd_nxt);
3338
3339 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3340 }
3341 EXPORT_SYMBOL(tcp_stream_memory_free);
3342
3343 struct proto tcp_prot = {
3344 .name = "TCP",
3345 .owner = THIS_MODULE,
3346 .close = tcp_close,
3347 .pre_connect = tcp_v4_pre_connect,
3348 .connect = tcp_v4_connect,
3349 .disconnect = tcp_disconnect,
3350 .accept = inet_csk_accept,
3351 .ioctl = tcp_ioctl,
3352 .init = tcp_v4_init_sock,
3353 .destroy = tcp_v4_destroy_sock,
3354 .shutdown = tcp_shutdown,
3355 .setsockopt = tcp_setsockopt,
3356 .getsockopt = tcp_getsockopt,
3357 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3358 .keepalive = tcp_set_keepalive,
3359 .recvmsg = tcp_recvmsg,
3360 .sendmsg = tcp_sendmsg,
3361 .splice_eof = tcp_splice_eof,
3362 .backlog_rcv = tcp_v4_do_rcv,
3363 .release_cb = tcp_release_cb,
3364 .hash = inet_hash,
3365 .unhash = inet_unhash,
3366 .get_port = inet_csk_get_port,
3367 .put_port = inet_put_port,
3368 #ifdef CONFIG_BPF_SYSCALL
3369 .psock_update_sk_prot = tcp_bpf_update_proto,
3370 #endif
3371 .enter_memory_pressure = tcp_enter_memory_pressure,
3372 .leave_memory_pressure = tcp_leave_memory_pressure,
3373 .stream_memory_free = tcp_stream_memory_free,
3374 .sockets_allocated = &tcp_sockets_allocated,
3375 .orphan_count = &tcp_orphan_count,
3376
3377 .memory_allocated = &tcp_memory_allocated,
3378 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3379
3380 .memory_pressure = &tcp_memory_pressure,
3381 .sysctl_mem = sysctl_tcp_mem,
3382 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3383 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3384 .max_header = MAX_TCP_HEADER,
3385 .obj_size = sizeof(struct tcp_sock),
3386 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3387 .twsk_prot = &tcp_timewait_sock_ops,
3388 .rsk_prot = &tcp_request_sock_ops,
3389 .h.hashinfo = NULL,
3390 .no_autobind = true,
3391 .diag_destroy = tcp_abort,
3392 };
3393 EXPORT_SYMBOL(tcp_prot);
3394
3395 static void __net_exit tcp_sk_exit(struct net *net)
3396 {
3397 if (net->ipv4.tcp_congestion_control)
3398 bpf_module_put(net->ipv4.tcp_congestion_control,
3399 net->ipv4.tcp_congestion_control->owner);
3400 }
3401
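/* Pick the ehash for a netns: a child netns gets its own hash sized by the
 * parent's sysctl_tcp_child_ehash_entries (rounded up to a power of two);
 * init_net, a zero sysctl or an allocation failure fall back to the global
 * tcp_hashinfo.
 */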
3402 static void __net_init tcp_set_hashinfo(struct net *net)
3403 {
3404 struct inet_hashinfo *hinfo;
3405 unsigned int ehash_entries;
3406 struct net *old_net;
3407
3408 if (net_eq(net, &init_net))
3409 goto fallback;
3410
3411 old_net = current->nsproxy->net_ns;
3412 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3413 if (!ehash_entries)
3414 goto fallback;
3415
3416 ehash_entries = roundup_pow_of_two(ehash_entries);
3417 hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3418 if (!hinfo) {
3419 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3420 "for a netns, fallback to the global one\n",
3421 ehash_entries);
3422 fallback:
3423 hinfo = &tcp_hashinfo;
3424 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3425 }
3426
3427 net->ipv4.tcp_death_row.hashinfo = hinfo;
3428 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3429 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3430 }
3431
3432 static int __net_init tcp_sk_init(struct net *net)
3433 {
3434 net->ipv4.sysctl_tcp_ecn = 2;
3435 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3436
3437 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3438 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3439 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3440 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3441 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3442
3443 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3444 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3445 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3446
3447 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3448 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3449 net->ipv4.sysctl_tcp_syncookies = 1;
3450 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3451 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3452 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3453 net->ipv4.sysctl_tcp_orphan_retries = 0;
3454 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3455 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3456 net->ipv4.sysctl_tcp_tw_reuse = 2;
3457 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3458
3459 refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3460 tcp_set_hashinfo(net);
3461
3462 net->ipv4.sysctl_tcp_sack = 1;
3463 net->ipv4.sysctl_tcp_window_scaling = 1;
3464 net->ipv4.sysctl_tcp_timestamps = 1;
3465 net->ipv4.sysctl_tcp_early_retrans = 3;
3466 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3467 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3468 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3469 net->ipv4.sysctl_tcp_max_reordering = 300;
3470 net->ipv4.sysctl_tcp_dsack = 1;
3471 net->ipv4.sysctl_tcp_app_win = 31;
3472 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3473 net->ipv4.sysctl_tcp_frto = 2;
3474 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3475 /* This limits the percentage of the congestion window which we
3476 * will allow a single TSO frame to consume. Building TSO frames
3477 * which are too large can cause TCP streams to be bursty.
3478 */
3479 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3480 /* Default TSQ limit of 16 TSO segments */
3481 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3482
3483 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3484 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3485
3486 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3487 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3488 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3489 net->ipv4.sysctl_tcp_autocorking = 1;
3490 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3491 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3492 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3493 if (net != &init_net) {
3494 memcpy(net->ipv4.sysctl_tcp_rmem,
3495 init_net.ipv4.sysctl_tcp_rmem,
3496 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3497 memcpy(net->ipv4.sysctl_tcp_wmem,
3498 init_net.ipv4.sysctl_tcp_wmem,
3499 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3500 }
3501 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3502 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3503 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3504 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3505 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3506 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3507 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3508
3509 /* Set default values for PLB */
3510 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3511 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3512 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3513 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3514 /* Default congestion threshold for PLB to mark a round is 50% */
3515 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3516
3517 /* Reno is always built in */
3518 if (!net_eq(net, &init_net) &&
3519 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3520 init_net.ipv4.tcp_congestion_control->owner))
3521 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3522 else
3523 net->ipv4.tcp_congestion_control = &tcp_reno;
3524
3525 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3526 net->ipv4.sysctl_tcp_shrink_window = 0;
3527
3528 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3529 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3530
3531 return 0;
3532 }
3533
3534 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3535 {
3536 struct net *net;
3537
3538 /* Make sure concurrent calls to tcp_sk_exit_batch() from net_cleanup_work
3539 * and from the failed setup_net() error unwinding path are serialized.
3540 *
3541 * Because tcp_twsk_purge() handles twsk in any dead netns, not just those
3542 * in net_exit_list, the thread that dismantles a particular twsk must
3543 * do so without another thread progressing to refcount_dec_and_test() of
3544 * tcp_death_row.tw_refcount.
3545 */
3546 mutex_lock(&tcp_exit_batch_mutex);
3547
3548 tcp_twsk_purge(net_exit_list);
3549
3550 list_for_each_entry(net, net_exit_list, exit_list) {
3551 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3552 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3553 tcp_fastopen_ctx_destroy(net);
3554 }
3555
3556 mutex_unlock(&tcp_exit_batch_mutex);
3557 }
3558
3559 static struct pernet_operations __net_initdata tcp_sk_ops = {
3560 .init = tcp_sk_init,
3561 .exit = tcp_sk_exit,
3562 .exit_batch = tcp_sk_exit_batch,
3563 };
3564
3565 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3566 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3567 struct sock_common *sk_common, uid_t uid)
3568
3569 #define INIT_BATCH_SZ 16
3570
3571 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3572 {
3573 struct bpf_tcp_iter_state *iter = priv_data;
3574 int err;
3575
3576 err = bpf_iter_init_seq_net(priv_data, aux);
3577 if (err)
3578 return err;
3579
3580 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3581 if (err) {
3582 bpf_iter_fini_seq_net(priv_data);
3583 return err;
3584 }
3585
3586 return 0;
3587 }
3588
3589 static void bpf_iter_fini_tcp(void *priv_data)
3590 {
3591 struct bpf_tcp_iter_state *iter = priv_data;
3592
3593 bpf_iter_fini_seq_net(priv_data);
3594 kvfree(iter->batch);
3595 }
3596
3597 static const struct bpf_iter_seq_info tcp_seq_info = {
3598 .seq_ops = &bpf_iter_tcp_seq_ops,
3599 .init_seq_private = bpf_iter_init_tcp,
3600 .fini_seq_private = bpf_iter_fini_tcp,
3601 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3602 };
3603
3604 static const struct bpf_func_proto *
3605 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3606 const struct bpf_prog *prog)
3607 {
3608 switch (func_id) {
3609 case BPF_FUNC_setsockopt:
3610 return &bpf_sk_setsockopt_proto;
3611 case BPF_FUNC_getsockopt:
3612 return &bpf_sk_getsockopt_proto;
3613 default:
3614 return NULL;
3615 }
3616 }
3617
3618 static struct bpf_iter_reg tcp_reg_info = {
3619 .target = "tcp",
3620 .ctx_arg_info_size = 1,
3621 .ctx_arg_info = {
3622 { offsetof(struct bpf_iter__tcp, sk_common),
3623 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3624 },
3625 .get_func_proto = bpf_iter_tcp_get_func_proto,
3626 .seq_info = &tcp_seq_info,
3627 };
3628
3629 static void __init bpf_iter_register(void)
3630 {
3631 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3632 if (bpf_iter_reg_target(&tcp_reg_info))
3633 pr_warn("Warning: could not register bpf iterator tcp\n");
3634 }
3635
3636 #endif
3637
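/* Boot-time init: create the per-cpu control sockets used to send RSTs and
 * ACKs on behalf of sockets we do not own (SYN-RECV, TIME-WAIT), register
 * the per-netns operations and, when enabled, the bpf tcp iterator.
 */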
3638 void __init tcp_v4_init(void)
3639 {
3640 int cpu, res;
3641
3642 for_each_possible_cpu(cpu) {
3643 struct sock *sk;
3644
3645 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3646 IPPROTO_TCP, &init_net);
3647 if (res)
3648 panic("Failed to create the TCP control socket.\n");
3649 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3650
3651 /* Please enforce IP_DF and IPID==0 for RST and
3652 * ACK sent in SYN-RECV and TIME-WAIT state.
3653 */
3654 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3655
3656 sk->sk_clockid = CLOCK_MONOTONIC;
3657
3658 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3659 }
3660 if (register_pernet_subsys(&tcp_sk_ops))
3661 panic("Failed to create the TCP control socket.\n");
3662
3663 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3664 bpf_iter_register();
3665 #endif
3666 }
3667