1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3 *
4 * Copyright (c) 2021, Red Hat.
5 */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <net/sock.h>
12 #include <net/protocol.h>
13 #include <net/tcp.h>
14 #include <net/mptcp.h>
15 #include "protocol.h"
16
17 #define MIN_INFO_OPTLEN_SIZE 16
18 #define MIN_FULL_INFO_OPTLEN_SIZE 40
19
__mptcp_tcp_fallback(struct mptcp_sock * msk)20 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
21 {
22 msk_owned_by_me(msk);
23
24 if (likely(!__mptcp_check_fallback(msk)))
25 return NULL;
26
27 return msk->first;
28 }
29
sockopt_seq_reset(const struct sock * sk)30 static u32 sockopt_seq_reset(const struct sock *sk)
31 {
32 sock_owned_by_me(sk);
33
34 /* Highbits contain state. Allows to distinguish sockopt_seq
35 * of listener and established:
36 * s0 = new_listener()
37 * sockopt(s0) - seq is 1
38 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0)
39 * sockopt(s0) - seq increments to 2 on s0
40 * sockopt(s1) // seq increments to 2 on s1 (different option)
41 * new ssk completes join, inherits options from s0 // seq 2
42 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
43 *
44 * Set High order bits to sk_state so ssk->seq == msk->seq test
45 * will fail.
46 */
47
48 return (u32)sk->sk_state << 24u;
49 }
50
sockopt_seq_inc(struct mptcp_sock * msk)51 static void sockopt_seq_inc(struct mptcp_sock *msk)
52 {
53 u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff;
54
55 msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq;
56 }
57
/* Fetch an integer option value from @optval into @val.
 *
 * Returns 0 on success, -EINVAL when @optlen is smaller than an int and
 * -EFAULT when the copy fails. @msk is not used here.
 */
static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval,
				unsigned int optlen, int *val)
{
	if (optlen >= sizeof(int)) {
		if (copy_from_sockptr(val, optval, sizeof(*val)))
			return -EFAULT;

		return 0;
	}

	return -EINVAL;
}
69
__mptcp_subflow_set_rcvbuf(struct sock * ssk,int val)70 static void __mptcp_subflow_set_rcvbuf(struct sock *ssk, int val)
71 {
72 WRITE_ONCE(ssk->sk_rcvbuf, val);
73 tcp_set_rcvbuf(ssk, val);
74 }
75
mptcp_sol_socket_sync_intval(struct mptcp_sock * msk,int optname,int val)76 static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val)
77 {
78 struct mptcp_subflow_context *subflow;
79 struct sock *sk = (struct sock *)msk;
80
81 lock_sock(sk);
82 sockopt_seq_inc(msk);
83
84 mptcp_for_each_subflow(msk, subflow) {
85 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
86 bool slow = lock_sock_fast(ssk);
87
88 switch (optname) {
89 case SO_DEBUG:
90 sock_valbool_flag(ssk, SOCK_DBG, !!val);
91 break;
92 case SO_KEEPALIVE:
93 if (ssk->sk_prot->keepalive)
94 ssk->sk_prot->keepalive(ssk, !!val);
95 sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
96 break;
97 case SO_PRIORITY:
98 WRITE_ONCE(ssk->sk_priority, val);
99 break;
100 case SO_SNDBUF:
101 case SO_SNDBUFFORCE:
102 ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
103 WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
104 mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
105 break;
106 case SO_RCVBUF:
107 case SO_RCVBUFFORCE:
108 ssk->sk_userlocks |= SOCK_RCVBUF_LOCK;
109 __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf);
110 break;
111 case SO_MARK:
112 if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
113 WRITE_ONCE(ssk->sk_mark, sk->sk_mark);
114 sk_dst_reset(ssk);
115 }
116 break;
117 case SO_INCOMING_CPU:
118 WRITE_ONCE(ssk->sk_incoming_cpu, val);
119 break;
120 }
121
122 subflow->setsockopt_seq = msk->setsockopt_seq;
123 unlock_sock_fast(ssk, slow);
124 }
125
126 release_sock(sk);
127 }
128
mptcp_sol_socket_intval(struct mptcp_sock * msk,int optname,int val)129 static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val)
130 {
131 sockptr_t optval = KERNEL_SOCKPTR(&val);
132 struct sock *sk = (struct sock *)msk;
133 int ret;
134
135 ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
136 optval, sizeof(val));
137 if (ret)
138 return ret;
139
140 mptcp_sol_socket_sync_intval(msk, optname, val);
141 return 0;
142 }
143
mptcp_so_incoming_cpu(struct mptcp_sock * msk,int val)144 static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val)
145 {
146 struct sock *sk = (struct sock *)msk;
147
148 WRITE_ONCE(sk->sk_incoming_cpu, val);
149
150 mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val);
151 }
152
mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock * msk,int optname,int val)153 static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val)
154 {
155 sockptr_t optval = KERNEL_SOCKPTR(&val);
156 struct mptcp_subflow_context *subflow;
157 struct sock *sk = (struct sock *)msk;
158 int ret;
159
160 ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
161 optval, sizeof(val));
162 if (ret)
163 return ret;
164
165 lock_sock(sk);
166 mptcp_for_each_subflow(msk, subflow) {
167 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
168
169 lock_sock(ssk);
170 sock_set_timestamp(ssk, optname, !!val);
171 release_sock(ssk);
172 }
173
174 release_sock(sk);
175 return 0;
176 }
177
/* Dispatch the SOL_SOCKET options that carry a plain integer value.
 *
 * The integer is copied in first; any copy error is returned as is.
 * Option names not handled here yield -ENOPROTOOPT.
 */
static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
					   sockptr_t optval,
					   unsigned int optlen)
{
	int val;
	int err;

	err = mptcp_get_int_option(msk, optval, optlen, &val);
	if (err)
		return err;

	switch (optname) {
	case SO_INCOMING_CPU:
		mptcp_so_incoming_cpu(msk, val);
		return 0;
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val);
	case SO_KEEPALIVE:
	case SO_DEBUG:
	case SO_MARK:
	case SO_PRIORITY:
	case SO_SNDBUF:
	case SO_SNDBUFFORCE:
	case SO_RCVBUF:
	case SO_RCVBUFFORCE:
		return mptcp_sol_socket_intval(msk, optname, val);
	default:
		return -ENOPROTOOPT;
	}
}
210
/* Handle SO_TIMESTAMPING_OLD / SO_TIMESTAMPING_NEW.
 *
 * @optval may hold either a full struct so_timestamping or just an int;
 * in the latter case the int is taken as the flags member and the rest
 * of the structure is zeroed. Any other @optlen is rejected.
 *
 * The option is applied to the msk socket first and then to every
 * subflow, under the msk and subflow socket locks.
 *
 * Returns -EINVAL for an unexpected @optlen, -EFAULT when the copy fails,
 * the sock_setsockopt() error for the msk, or otherwise the first
 * negative value returned by sock_set_timestamping() on a subflow
 * (0 when all of them succeed).
 */
static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
						    int optname,
						    sockptr_t optval,
						    unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct so_timestamping timestamping;
	int ret;

	if (optlen == sizeof(timestamping)) {
		if (copy_from_sockptr(&timestamping, optval,
				      sizeof(timestamping)))
			return -EFAULT;
	} else if (optlen == sizeof(int)) {
		/* flags-only form: every other field stays zero */
		memset(&timestamping, 0, sizeof(timestamping));

		if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
			return -EFAULT;
	} else {
		return -EINVAL;
	}

	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
			      KERNEL_SOCKPTR(&timestamping),
			      sizeof(timestamping));
	if (ret)
		return ret;

	lock_sock(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int err;

		lock_sock(ssk);
		err = sock_set_timestamping(ssk, optname, timestamping);
		release_sock(ssk);

		/* keep going on failure, but remember the first error */
		if (err < 0 && ret == 0)
			ret = err;
	}

	release_sock(sk);

	return ret;
}
258
/* Handle SO_LINGER: apply it to the msk socket, then mirror the result on
 * every subflow and stamp each one with the new sockopt sequence.
 *
 * Returns -EINVAL when @optlen is too short for a struct linger, -EFAULT
 * when the copy fails, the sock_setsockopt() error, or 0.
 */
static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
					      unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct mptcp_subflow_context *subflow;
	struct linger ling;
	int err;

	if (optlen < sizeof(ling))
		return -EINVAL;

	if (copy_from_sockptr(&ling, optval, sizeof(ling)))
		return -EFAULT;

	err = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER,
			      KERNEL_SOCKPTR(&ling), sizeof(ling));
	if (err)
		return err;

	lock_sock(sk);
	sockopt_seq_inc(msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slowpath = lock_sock_fast(ssk);

		if (ling.l_onoff) {
			/* take the linger time as stored on the msk socket */
			ssk->sk_lingertime = sk->sk_lingertime;
			sock_set_flag(ssk, SOCK_LINGER);
		} else {
			sock_reset_flag(ssk, SOCK_LINGER);
		}

		subflow->setsockopt_seq = msk->setsockopt_seq;
		unlock_sock_fast(ssk, slowpath);
	}

	release_sock(sk);

	return 0;
}
299
/* Entry point for setsockopt() at SOL_SOCKET level on an MPTCP socket.
 *
 * Options fall into five groups: applied to the socket returned by
 * __mptcp_nmpc_sk() and mirrored back on the msk; integer options;
 * timestamping; linger; options applied to the msk socket only; and
 * options accepted without doing anything. Everything else returns
 * -EOPNOTSUPP.
 */
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int err;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
	case SO_BINDTODEVICE:
	case SO_BINDTOIFINDEX:
		lock_sock(sk);
		ssk = __mptcp_nmpc_sk(msk);
		if (IS_ERR(ssk)) {
			release_sock(sk);
			return PTR_ERR(ssk);
		}

		err = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen);
		if (!err) {
			/* copy the resulting setting back to the msk */
			switch (optname) {
			case SO_REUSEPORT:
				sk->sk_reuseport = ssk->sk_reuseport;
				break;
			case SO_REUSEADDR:
				sk->sk_reuse = ssk->sk_reuse;
				break;
			case SO_BINDTODEVICE:
			case SO_BINDTOIFINDEX:
				sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
				break;
			}
		}
		release_sock(sk);
		return err;

	case SO_KEEPALIVE:
	case SO_PRIORITY:
	case SO_SNDBUF:
	case SO_SNDBUFFORCE:
	case SO_RCVBUF:
	case SO_RCVBUFFORCE:
	case SO_MARK:
	case SO_INCOMING_CPU:
	case SO_DEBUG:
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		return mptcp_setsockopt_sol_socket_int(msk, optname, optval,
						       optlen);

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		return mptcp_setsockopt_sol_socket_timestamping(msk, optname,
								optval, optlen);

	case SO_LINGER:
		return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen);

	case SO_RCVLOWAT:
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
	case SO_BUSY_POLL:
	case SO_PREFER_BUSY_POLL:
	case SO_BUSY_POLL_BUDGET:
		/* No need to copy: only relevant for msk */
		return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
				       optval, optlen);

	case SO_NO_CHECK:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_BSDCOMPAT:
	case SO_PASSCRED:
	case SO_PASSPIDFD:
	case SO_PASSSEC:
	case SO_RXQ_OVFL:
	case SO_WIFI_STATUS:
	case SO_NOFCS:
	case SO_SELECT_ERR_QUEUE:
		/* accepted, nothing to do */
		return 0;
	}

	/* SO_OOBINLINE is not supported, let's avoid the related mess.
	 * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF,
	 * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER:
	 * we must be careful with subflows.
	 *
	 * SO_ATTACH_REUSEPORT_EBPF is not supported, as it explicitly
	 * checks the sk_protocol field.
	 *
	 * SO_PEEK_OFF is unsupported, as it is for plain TCP.
	 * SO_MAX_PACING_RATE is unsupported, we must be careful with
	 * subflows.
	 * SO_CNX_ADVICE is currently unsupported; it could possibly be
	 * relevant, but likely needs careful design.
	 *
	 * SO_ZEROCOPY is currently unsupported, TODO in sndmsg.
	 * SO_TXTIME is currently unsupported.
	 */

	return -EOPNOTSUPP;
}
396
/* Handle the SOL_IPV6 options that are applied to the socket returned by
 * __mptcp_nmpc_sk() and then mirrored on the msk: IPV6_V6ONLY,
 * IPV6_TRANSPARENT and IPV6_FREEBIND. Any other option name returns
 * -EOPNOTSUPP without taking the socket lock.
 */
static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int err;

	switch (optname) {
	case IPV6_V6ONLY:
	case IPV6_TRANSPARENT:
	case IPV6_FREEBIND:
		break;
	default:
		return -EOPNOTSUPP;
	}

	lock_sock(sk);

	ssk = __mptcp_nmpc_sk(msk);
	if (IS_ERR(ssk)) {
		err = PTR_ERR(ssk);
		goto unlock;
	}

	err = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
	if (err != 0)
		goto unlock;

	sockopt_seq_inc(msk);

	/* reflect the value now set on the subflow back on the msk */
	if (optname == IPV6_V6ONLY)
		sk->sk_ipv6only = ssk->sk_ipv6only;
	else if (optname == IPV6_TRANSPARENT)
		inet_assign_bit(TRANSPARENT, sk,
				inet_test_bit(TRANSPARENT, ssk));
	else
		inet_assign_bit(FREEBIND, sk,
				inet_test_bit(FREEBIND, ssk));

unlock:
	release_sock(sk);
	return err;
}
443
mptcp_supported_sockopt(int level,int optname)444 static bool mptcp_supported_sockopt(int level, int optname)
445 {
446 if (level == SOL_IP) {
447 switch (optname) {
448 /* should work fine */
449 case IP_FREEBIND:
450 case IP_TRANSPARENT:
451 case IP_BIND_ADDRESS_NO_PORT:
452 case IP_LOCAL_PORT_RANGE:
453
454 /* the following are control cmsg related */
455 case IP_PKTINFO:
456 case IP_RECVTTL:
457 case IP_RECVTOS:
458 case IP_RECVOPTS:
459 case IP_RETOPTS:
460 case IP_PASSSEC:
461 case IP_RECVORIGDSTADDR:
462 case IP_CHECKSUM:
463 case IP_RECVFRAGSIZE:
464
465 /* common stuff that need some love */
466 case IP_TOS:
467 case IP_TTL:
468 case IP_MTU_DISCOVER:
469 case IP_RECVERR:
470
471 /* possibly less common may deserve some love */
472 case IP_MINTTL:
473
474 /* the following is apparently a no-op for plain TCP */
475 case IP_RECVERR_RFC4884:
476 return true;
477 }
478
479 /* IP_OPTIONS is not supported, needs subflow care */
480 /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */
481 /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF,
482 * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP,
483 * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE,
484 * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP,
485 * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE,
486 * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal
487 * with mcast stuff
488 */
489 /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */
490 return false;
491 }
492 if (level == SOL_IPV6) {
493 switch (optname) {
494 case IPV6_V6ONLY:
495
496 /* the following are control cmsg related */
497 case IPV6_RECVPKTINFO:
498 case IPV6_2292PKTINFO:
499 case IPV6_RECVHOPLIMIT:
500 case IPV6_2292HOPLIMIT:
501 case IPV6_RECVRTHDR:
502 case IPV6_2292RTHDR:
503 case IPV6_RECVHOPOPTS:
504 case IPV6_2292HOPOPTS:
505 case IPV6_RECVDSTOPTS:
506 case IPV6_2292DSTOPTS:
507 case IPV6_RECVTCLASS:
508 case IPV6_FLOWINFO:
509 case IPV6_RECVPATHMTU:
510 case IPV6_RECVORIGDSTADDR:
511 case IPV6_RECVFRAGSIZE:
512
513 /* the following ones need some love but are quite common */
514 case IPV6_TCLASS:
515 case IPV6_TRANSPARENT:
516 case IPV6_FREEBIND:
517 case IPV6_PKTINFO:
518 case IPV6_2292PKTOPTIONS:
519 case IPV6_UNICAST_HOPS:
520 case IPV6_MTU_DISCOVER:
521 case IPV6_MTU:
522 case IPV6_RECVERR:
523 case IPV6_FLOWINFO_SEND:
524 case IPV6_FLOWLABEL_MGR:
525 case IPV6_MINHOPCOUNT:
526 case IPV6_DONTFRAG:
527 case IPV6_AUTOFLOWLABEL:
528
529 /* the following one is a no-op for plain TCP */
530 case IPV6_RECVERR_RFC4884:
531 return true;
532 }
533
534 /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are
535 * not supported
536 */
537 /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF,
538 * IPV6_MULTICAST_IF, IPV6_ADDRFORM,
539 * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST,
540 * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP,
541 * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP,
542 * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER
543 * are not supported better not deal with mcast
544 */
545 /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */
546
547 /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */
548 /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */
549 return false;
550 }
551 if (level == SOL_TCP) {
552 switch (optname) {
553 /* the following are no-op or should work just fine */
554 case TCP_THIN_DUPACK:
555 case TCP_DEFER_ACCEPT:
556
557 /* the following need some love */
558 case TCP_MAXSEG:
559 case TCP_NODELAY:
560 case TCP_THIN_LINEAR_TIMEOUTS:
561 case TCP_CONGESTION:
562 case TCP_CORK:
563 case TCP_KEEPIDLE:
564 case TCP_KEEPINTVL:
565 case TCP_KEEPCNT:
566 case TCP_SYNCNT:
567 case TCP_SAVE_SYN:
568 case TCP_LINGER2:
569 case TCP_WINDOW_CLAMP:
570 case TCP_QUICKACK:
571 case TCP_USER_TIMEOUT:
572 case TCP_TIMESTAMP:
573 case TCP_NOTSENT_LOWAT:
574 case TCP_TX_DELAY:
575 case TCP_INQ:
576 case TCP_FASTOPEN:
577 case TCP_FASTOPEN_CONNECT:
578 case TCP_FASTOPEN_KEY:
579 case TCP_FASTOPEN_NO_COOKIE:
580 return true;
581 }
582
583 /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */
584
585 /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
586 * TCP_REPAIR_WINDOW are not supported, better avoid this mess
587 */
588 }
589 return false;
590 }
591
/* Handle TCP_CONGESTION: copy in the algorithm name, then try to set it
 * on every subflow. All subflows are attempted even after a failure and
 * the first negative result is returned. The name is stored in
 * msk->ca_name only when no subflow failed.
 *
 * Returns -EINVAL for an empty option and -EFAULT when the name cannot be
 * copied in.
 */
static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval,
					       unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct mptcp_subflow_context *subflow;
	char name[TCP_CA_NAME_MAX];
	bool cap_net_admin;
	int ret;

	if (optlen < 1)
		return -EINVAL;

	/* copy at most TCP_CA_NAME_MAX - 1 bytes, leaving room for the NUL */
	ret = strncpy_from_sockptr(name, optval,
				   min_t(long, TCP_CA_NAME_MAX - 1, optlen));
	if (ret < 0)
		return -EFAULT;

	name[ret] = 0;

	cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN);

	ret = 0;

	lock_sock(sk);
	sockopt_seq_inc(msk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int err;

		lock_sock(ssk);
		err = tcp_set_congestion_control(ssk, name, true, cap_net_admin);
		if (!ret && err < 0)
			ret = err;
		subflow->setsockopt_seq = msk->setsockopt_seq;
		release_sock(ssk);
	}

	if (!ret)
		strscpy(msk->ca_name, name, sizeof(msk->ca_name));

	release_sock(sk);

	return ret;
}
634
__mptcp_setsockopt_set_val(struct mptcp_sock * msk,int max,int (* set_val)(struct sock *,int),int * msk_val,int val)635 static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
636 int (*set_val)(struct sock *, int),
637 int *msk_val, int val)
638 {
639 struct mptcp_subflow_context *subflow;
640 int err = 0;
641
642 mptcp_for_each_subflow(msk, subflow) {
643 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
644 int ret;
645
646 lock_sock(ssk);
647 ret = set_val(ssk, val);
648 err = err ? : ret;
649 release_sock(ssk);
650 }
651
652 if (!err) {
653 *msk_val = val;
654 sockopt_seq_inc(msk);
655 }
656
657 return err;
658 }
659
__mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock * msk,int val)660 static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
661 {
662 struct mptcp_subflow_context *subflow;
663 struct sock *sk = (struct sock *)msk;
664
665 sockopt_seq_inc(msk);
666 msk->cork = !!val;
667 mptcp_for_each_subflow(msk, subflow) {
668 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
669
670 lock_sock(ssk);
671 __tcp_sock_set_cork(ssk, !!val);
672 release_sock(ssk);
673 }
674 if (!val)
675 mptcp_check_and_set_pending(sk);
676
677 return 0;
678 }
679
__mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock * msk,int val)680 static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
681 {
682 struct mptcp_subflow_context *subflow;
683 struct sock *sk = (struct sock *)msk;
684
685 sockopt_seq_inc(msk);
686 msk->nodelay = !!val;
687 mptcp_for_each_subflow(msk, subflow) {
688 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
689
690 lock_sock(ssk);
691 __tcp_sock_set_nodelay(ssk, !!val);
692 release_sock(ssk);
693 }
694 if (val)
695 mptcp_check_and_set_pending(sk);
696 return 0;
697 }
698
/* Handle the SOL_IP options that are set on the msk first and then copied
 * to the socket returned by __mptcp_nmpc_sk(): IP_FREEBIND,
 * IP_TRANSPARENT, IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE.
 *
 * Returns the ip_setsockopt() error, the __mptcp_nmpc_sk() error, or
 * -EOPNOTSUPP (with a one-time warning) for any other option name.
 */
static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int ret;

	ret = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
	if (ret)
		return ret;

	lock_sock(sk);

	ssk = __mptcp_nmpc_sk(msk);
	if (IS_ERR(ssk)) {
		ret = PTR_ERR(ssk);
		goto unlock;
	}

	/* copy the value just stored on the msk down to the subflow */
	switch (optname) {
	case IP_FREEBIND:
		inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
		break;
	case IP_TRANSPARENT:
		inet_assign_bit(TRANSPARENT, ssk,
				inet_test_bit(TRANSPARENT, sk));
		break;
	case IP_BIND_ADDRESS_NO_PORT:
		inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk,
				inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
		break;
	case IP_LOCAL_PORT_RANGE:
		WRITE_ONCE(inet_sk(ssk)->local_port_range,
			   READ_ONCE(inet_sk(sk)->local_port_range));
		break;
	default:
		/* callers are expected to pass only the options above */
		release_sock(sk);
		WARN_ON_ONCE(1);
		return -EOPNOTSUPP;
	}

	sockopt_seq_inc(msk);

unlock:
	release_sock(sk);
	return ret;
}
744
/* Handle IP_TOS: set it on the msk via ip_setsockopt(), then read the
 * resulting tos back from the msk and apply it to every subflow.
 * Returns the ip_setsockopt() error, or 0.
 */
static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct mptcp_subflow_context *subflow;
	int tos;
	int ret;

	ret = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
	if (ret)
		return ret;

	lock_sock(sk);
	sockopt_seq_inc(msk);

	tos = READ_ONCE(inet_sk(sk)->tos);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slowpath = lock_sock_fast(ssk);

		__ip_sock_set_tos(ssk, tos);
		unlock_sock_fast(ssk, slowpath);
	}

	release_sock(sk);

	return 0;
}
772
/* Dispatch SOL_IP options to their handler; option names without a
 * handler here return -EOPNOTSUPP.
 */
static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	switch (optname) {
	case IP_TOS:
		return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen);
	case IP_FREEBIND:
	case IP_TRANSPARENT:
	case IP_BIND_ADDRESS_NO_PORT:
	case IP_LOCAL_PORT_RANGE:
		return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen);
	default:
		return -EOPNOTSUPP;
	}
}
788
/* Apply a socket option only to the socket returned by __mptcp_nmpc_sk(),
 * with the msk socket lock held. Returns the __mptcp_nmpc_sk() error or
 * the tcp_setsockopt() result.
 */
static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
					  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int ret;

	/* Limit to first subflow, before the connection establishment */
	lock_sock(sk);

	ssk = __mptcp_nmpc_sk(msk);
	if (IS_ERR(ssk))
		ret = PTR_ERR(ssk);
	else
		ret = tcp_setsockopt(ssk, level, optname, optval, optlen);

	release_sock(sk);

	return ret;
}
810
/* Apply a socket option to every subflow through tcp_setsockopt(). All
 * subflows are attempted; the first negative result is returned. The msk
 * sockopt sequence is advanced only when none of them failed.
 */
static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
				   int optname, sockptr_t optval,
				   unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;
	int first_err = 0;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int rc = tcp_setsockopt(ssk, level, optname, optval, optlen);

		if (!first_err && rc < 0)
			first_err = rc;
	}

	if (first_err)
		return first_err;

	sockopt_seq_inc(msk);

	return 0;
}
832
/* Entry point for setsockopt() at SOL_TCP level on an MPTCP socket.
 *
 * Options that are not plain integers, or that go to the first subflow
 * only, are handled up front. The remaining ones are read as an int and
 * processed with the msk socket lock held. Unknown integer options
 * return -ENOPROTOOPT.
 */
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
				    sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (void *)msk;
	int val;
	int ret;

	switch (optname) {
	case TCP_ULP:
		return -EOPNOTSUPP;
	case TCP_CONGESTION:
		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
	case TCP_DEFER_ACCEPT:
		/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
		mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
		return 0;
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname,
						      optval, optlen);
	}

	ret = mptcp_get_int_option(msk, optval, optlen, &val);
	if (ret)
		return ret;

	lock_sock(sk);

	switch (optname) {
	case TCP_INQ:
		/* boolean option: only 0 and 1 are accepted */
		if (val < 0 || val > 1)
			ret = -EINVAL;
		else
			msk->recvmsg_inq = !!val;
		break;
	case TCP_NOTSENT_LOWAT:
		WRITE_ONCE(msk->notsent_lowat, val);
		mptcp_write_space(sk);
		break;
	case TCP_CORK:
		ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
		break;
	case TCP_NODELAY:
		ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
		break;
	case TCP_KEEPIDLE:
		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
						 &tcp_sock_set_keepidle_locked,
						 &msk->keepalive_idle, val);
		break;
	case TCP_KEEPINTVL:
		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
						 &tcp_sock_set_keepintvl,
						 &msk->keepalive_intvl, val);
		break;
	case TCP_KEEPCNT:
		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
						 &tcp_sock_set_keepcnt,
						 &msk->keepalive_cnt, val);
		break;
	case TCP_MAXSEG:
		/* stored on the msk before the subflows are updated */
		msk->maxseg = val;
		ret = mptcp_setsockopt_all_sf(msk, SOL_TCP, optname, optval,
					      optlen);
		break;
	default:
		ret = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);

	return ret;
}
906
/* setsockopt() handler for MPTCP sockets.
 *
 * SOL_SOCKET options are dispatched directly. For the other levels the
 * option must be on the mptcp_supported_sockopt() allow-list, otherwise
 * -ENOPROTOOPT is returned. If the msk is in TCP fallback the option is
 * passed straight to the remaining subflow; otherwise it goes to the
 * per-level MPTCP handler. Unknown levels return -EOPNOTSUPP.
 */
int mptcp_setsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *fallback_ssk;

	pr_debug("msk=%p\n", msk);

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	if (!mptcp_supported_sockopt(level, optname))
		return -ENOPROTOOPT;

	/* @@ the meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	fallback_ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);

	if (fallback_ssk)
		return tcp_setsockopt(fallback_ssk, level, optname, optval,
				      optlen);

	switch (level) {
	case SOL_IP:
		return mptcp_setsockopt_v4(msk, optname, optval, optlen);
	case SOL_IPV6:
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);
	case SOL_TCP:
		return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen);
	default:
		return -EOPNOTSUPP;
	}
}
944
/* Read a socket option from a single subflow: msk->first when it is set,
 * otherwise the socket returned by __mptcp_nmpc_sk(). The msk socket lock
 * is held around the lookup and the tcp_getsockopt() call.
 */
static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
					  char __user *optval, int __user *optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int ret;

	lock_sock(sk);

	ssk = msk->first;
	if (!ssk) {
		ssk = __mptcp_nmpc_sk(msk);
		if (IS_ERR(ssk)) {
			ret = PTR_ERR(ssk);
			goto unlock;
		}
	}

	ret = tcp_getsockopt(ssk, level, optname, optval, optlen);

unlock:
	release_sock(sk);
	return ret;
}
970
mptcp_diag_fill_info(struct mptcp_sock * msk,struct mptcp_info * info)971 void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
972 {
973 struct sock *sk = (struct sock *)msk;
974 u32 flags = 0;
975 bool slow;
976 u32 now;
977
978 memset(info, 0, sizeof(*info));
979
980 info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows);
981 info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
982 info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
983 info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
984
985 if (inet_sk_state_load(sk) == TCP_LISTEN)
986 return;
987
988 /* The following limits only make sense for the in-kernel PM */
989 if (mptcp_pm_is_kernel(msk)) {
990 info->mptcpi_limit_extra_subflows =
991 mptcp_pm_get_limit_extra_subflows(msk);
992 info->mptcpi_endp_signal_max =
993 mptcp_pm_get_endp_signal_max(msk);
994 info->mptcpi_limit_add_addr_accepted =
995 mptcp_pm_get_limit_add_addr_accepted(msk);
996 info->mptcpi_endp_subflow_max =
997 mptcp_pm_get_endp_subflow_max(msk);
998 info->mptcpi_endp_laminar_max =
999 mptcp_pm_get_endp_laminar_max(msk);
1000 info->mptcpi_endp_fullmesh_max =
1001 mptcp_pm_get_endp_fullmesh_max(msk);
1002 }
1003
1004 if (__mptcp_check_fallback(msk))
1005 flags |= MPTCP_INFO_FLAG_FALLBACK;
1006 if (READ_ONCE(msk->can_ack))
1007 flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
1008 info->mptcpi_flags = flags;
1009
1010 slow = lock_sock_fast(sk);
1011 info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
1012 info->mptcpi_token = msk->token;
1013 info->mptcpi_write_seq = msk->write_seq;
1014 info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
1015 info->mptcpi_bytes_sent = msk->bytes_sent;
1016 info->mptcpi_bytes_received = msk->bytes_received;
1017 info->mptcpi_bytes_retrans = msk->bytes_retrans;
1018 info->mptcpi_subflows_total = info->mptcpi_extra_subflows +
1019 __mptcp_has_initial_subflow(msk);
1020 now = tcp_jiffies32;
1021 info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
1022 info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv);
1023 unlock_sock_fast(sk, slow);
1024
1025 mptcp_data_lock(sk);
1026 info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv);
1027 info->mptcpi_snd_una = msk->snd_una;
1028 info->mptcpi_rcv_nxt = msk->ack_seq;
1029 info->mptcpi_bytes_acked = msk->bytes_acked;
1030 mptcp_data_unlock(sk);
1031 }
1032 EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
1033
/* Copy a struct mptcp_info snapshot of @msk to user space.
 *
 * The user-supplied length is clamped to sizeof(struct mptcp_info); the
 * number of bytes actually copied is written back through @optlen.
 *
 * Returns 0 on success (including the zero-length probe), -EINVAL for a
 * negative length and -EFAULT when a user access fails.
 */
static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen)
{
	struct mptcp_info m_info;
	int len;

	if (get_user(len, optlen))
		return -EFAULT;

	/* A negative length would be turned into a huge unsigned value by
	 * the min_t() below and clamp to the full structure size; reject it
	 * instead of copying data out for an invalid length.
	 */
	if (len < 0)
		return -EINVAL;

	/* When used only to check if a fallback to TCP happened. */
	if (len == 0)
		return 0;

	len = min_t(unsigned int, len, sizeof(struct mptcp_info));

	mptcp_diag_fill_info(msk, &m_info);

	if (put_user(len, optlen))
		return -EFAULT;

	if (copy_to_user(optval, &m_info, len))
		return -EFAULT;

	return 0;
}
1058
/* Write the subflow-data header back to user space and report the total
 * length through @optlen.
 *
 * The header copy is limited to the smaller of the user-declared header
 * size and sizeof(*sfd). The reported length is @copied plus the
 * user-declared header size when payload was copied, or just the header
 * bytes written otherwise.
 *
 * Returns 0 on success, -EFAULT when a user access fails.
 */
static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
				  char __user *optval,
				  u32 copied,
				  int __user *optlen)
{
	u32 hdr_len = min_t(u32, sfd->size_subflow_data, sizeof(*sfd));
	u32 total;

	if (copied)
		total = copied + sfd->size_subflow_data;
	else
		total = hdr_len;

	if (put_user(total, optlen))
		return -EFAULT;

	if (copy_to_user(optval, sfd, hdr_len))
		return -EFAULT;

	return 0;
}
1079
/* Read and validate the subflow-data header supplied by user space.
 *
 * Returns the number of bytes available after the header (the
 * user-supplied length minus the declared header size), -EFAULT when a
 * user access fails, or -EINVAL when the length or any header field is
 * not acceptable.
 */
static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
				  char __user *optval,
				  int __user *optlen)
{
	int user_len;
	int hdr_len;

	if (get_user(user_len, optlen))
		return -EFAULT;

	/* if mptcp_subflow_data size is changed, need to adjust
	 * this function to deal with programs using old version.
	 */
	BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE);

	if (user_len < MIN_INFO_OPTLEN_SIZE)
		return -EINVAL;

	memset(sfd, 0, sizeof(*sfd));

	hdr_len = min_t(unsigned int, user_len, sizeof(*sfd));
	if (copy_from_user(sfd, optval, hdr_len))
		return -EFAULT;

	/* size_subflow_data is u32, but len is signed */
	if (sfd->size_subflow_data > INT_MAX)
		return -EINVAL;
	if (sfd->size_user > INT_MAX)
		return -EINVAL;

	/* the declared header size must fit within what the user passed */
	if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE)
		return -EINVAL;
	if (sfd->size_subflow_data > user_len)
		return -EINVAL;

	/* these two fields are outputs and must come in as zero */
	if (sfd->num_subflows || sfd->size_kernel)
		return -EINVAL;

	return user_len - sfd->size_subflow_data;
}
1117
/* MPTCP_TCPINFO getsockopt handler.
 *
 * After the struct mptcp_subflow_data header, stores one struct tcp_info
 * per subflow (truncated to sfd.size_user bytes each) for as long as the
 * user buffer has room. Every subflow is counted in sfd.num_subflows,
 * even when its data did not fit.
 *
 * Returns 0 on success, -EFAULT when a user-space access fails, or the
 * negative error reported by mptcp_get_subflow_data().
 */
static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
				    int __user *optlen)
{
	unsigned int nr_subflows = 0, written = 0;
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct mptcp_subflow_data sfd;
	char __user *dst;
	int room;

	room = mptcp_get_subflow_data(&sfd, optval, optlen);
	if (room < 0)
		return room;

	sfd.size_kernel = sizeof(struct tcp_info);
	sfd.size_user = min_t(unsigned int, sfd.size_user,
			      sizeof(struct tcp_info));

	/* the per-subflow records start right after the user header */
	dst = optval + sfd.size_subflow_data;

	lock_sock(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct tcp_info info;

		nr_subflows++;

		/* no room left for another record: keep counting only */
		if (!room || room < sfd.size_user)
			continue;

		tcp_get_info(ssk, &info);

		if (copy_to_user(dst, &info, sfd.size_user))
			goto fault;

		dst += sfd.size_user;
		written += sfd.size_user;
		room -= sfd.size_user;
	}

	release_sock(sk);

	sfd.num_subflows = nr_subflows;

	return mptcp_put_subflow_data(&sfd, optval, written, optlen) ?
	       -EFAULT : 0;

fault:
	release_sock(sk);
	return -EFAULT;
}
1170
mptcp_get_sub_addrs(const struct sock * sk,struct mptcp_subflow_addrs * a)1171 static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
1172 {
1173 const struct inet_sock *inet = inet_sk(sk);
1174
1175 memset(a, 0, sizeof(*a));
1176
1177 if (sk->sk_family == AF_INET) {
1178 a->sin_local.sin_family = AF_INET;
1179 a->sin_local.sin_port = inet->inet_sport;
1180 a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr;
1181
1182 if (!a->sin_local.sin_addr.s_addr)
1183 a->sin_local.sin_addr.s_addr = inet->inet_saddr;
1184
1185 a->sin_remote.sin_family = AF_INET;
1186 a->sin_remote.sin_port = inet->inet_dport;
1187 a->sin_remote.sin_addr.s_addr = inet->inet_daddr;
1188 #if IS_ENABLED(CONFIG_IPV6)
1189 } else if (sk->sk_family == AF_INET6) {
1190 const struct ipv6_pinfo *np = inet6_sk(sk);
1191
1192 if (WARN_ON_ONCE(!np))
1193 return;
1194
1195 a->sin6_local.sin6_family = AF_INET6;
1196 a->sin6_local.sin6_port = inet->inet_sport;
1197
1198 if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
1199 a->sin6_local.sin6_addr = np->saddr;
1200 else
1201 a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr;
1202
1203 a->sin6_remote.sin6_family = AF_INET6;
1204 a->sin6_remote.sin6_port = inet->inet_dport;
1205 a->sin6_remote.sin6_addr = sk->sk_v6_daddr;
1206 #endif
1207 }
1208 }
1209
/* MPTCP_SUBFLOW_ADDRS getsockopt handler.
 *
 * After the struct mptcp_subflow_data header, stores one
 * struct mptcp_subflow_addrs per subflow (truncated to sfd.size_user
 * bytes each) for as long as the user buffer has room. Every subflow is
 * counted in sfd.num_subflows, even when its addresses did not fit.
 *
 * Returns 0 on success, -EFAULT when a user-space access fails, or the
 * negative error reported by mptcp_get_subflow_data().
 */
static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval,
					  int __user *optlen)
{
	unsigned int nr_subflows = 0, written = 0;
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct mptcp_subflow_data sfd;
	char __user *dst;
	int room;

	room = mptcp_get_subflow_data(&sfd, optval, optlen);
	if (room < 0)
		return room;

	sfd.size_kernel = sizeof(struct mptcp_subflow_addrs);
	sfd.size_user = min_t(unsigned int, sfd.size_user,
			      sizeof(struct mptcp_subflow_addrs));

	/* the per-subflow records start right after the user header */
	dst = optval + sfd.size_subflow_data;

	lock_sock(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_subflow_addrs addrs;

		nr_subflows++;

		/* no room left for another record: keep counting only */
		if (!room || room < sfd.size_user)
			continue;

		mptcp_get_sub_addrs(ssk, &addrs);

		if (copy_to_user(dst, &addrs, sfd.size_user))
			goto fault;

		dst += sfd.size_user;
		written += sfd.size_user;
		room -= sfd.size_user;
	}

	release_sock(sk);

	sfd.num_subflows = nr_subflows;

	return mptcp_put_subflow_data(&sfd, optval, written, optlen) ?
	       -EFAULT : 0;

fault:
	release_sock(sk);
	return -EFAULT;
}
1262
/* Fetch and validate the fixed-size head of the struct mptcp_full_info
 * that user space passed in @optval.
 *
 * Only the first MIN_FULL_INFO_OPTLEN_SIZE bytes, i.e. everything that
 * precedes the embedded mptcp_info, are read from user space; the rest
 * of @mfi is left zeroed.
 *
 * Returns the number of bytes available past that head, -EFAULT when a
 * user-space access fails, or -EINVAL when the option length is too
 * small or a field carries an unacceptable value.
 */
static int mptcp_get_full_info(struct mptcp_full_info *mfi,
			       char __user *optval,
			       int __user *optlen)
{
	int ulen;

	BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) !=
		     MIN_FULL_INFO_OPTLEN_SIZE);

	if (get_user(ulen, optlen))
		return -EFAULT;

	if (ulen < MIN_FULL_INFO_OPTLEN_SIZE)
		return -EINVAL;

	memset(mfi, 0, sizeof(*mfi));
	if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE))
		return -EFAULT;

	/* the kernel-written fields must be passed zeroed, and the
	 * user-provided sizes can't exceed INT_MAX
	 */
	if (mfi->size_tcpinfo_kernel || mfi->size_sfinfo_kernel ||
	    mfi->num_subflows ||
	    mfi->size_sfinfo_user > INT_MAX ||
	    mfi->size_tcpinfo_user > INT_MAX)
		return -EINVAL;

	return ulen - MIN_FULL_INFO_OPTLEN_SIZE;
}
1293
/* Copy the struct mptcp_full_info back to user space.
 *
 * @copylen is the number of bytes to report past the fixed-size head;
 * the head itself (MIN_FULL_INFO_OPTLEN_SIZE bytes) is always added to
 * both the copied amount and the length stored in @optlen.
 *
 * Returns 0 on success, -EFAULT when a user-space access fails.
 */
static int mptcp_put_full_info(struct mptcp_full_info *mfi,
			       char __user *optval,
			       u32 copylen,
			       int __user *optlen)
{
	u32 total = copylen + MIN_FULL_INFO_OPTLEN_SIZE;

	if (put_user(total, optlen) || copy_to_user(optval, mfi, total))
		return -EFAULT;

	return 0;
}
1307
/* MPTCP_FULL_INFO getsockopt handler.
 *
 * Reports the msk-level mptcp_info inside the option buffer, when there
 * is room for it past the fixed-size head. Per-subflow data goes into
 * the two user arrays referenced by mfi.subflow_info and mfi.tcp_info:
 * one struct mptcp_subflow_info and one struct tcp_info per subflow.
 * At most mfi.size_arrays_user subflows are reported, while
 * mfi.num_subflows carries the count of all the subflows.
 *
 * Returns 0 on success, -EFAULT when a user-space access fails, or the
 * negative error reported by mptcp_get_full_info().
 */
static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval,
				      int __user *optlen)
{
	unsigned int nr_subflows = 0, info_len = 0;
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	void __user *tcp_dst, *sf_dst;
	struct mptcp_full_info mfi;
	int room;

	room = mptcp_get_full_info(&mfi, optval, optlen);
	if (room < 0)
		return room;

	/* don't bother filling the mptcp info if there is not enough
	 * user-space-provided storage
	 */
	if (room > 0) {
		mptcp_diag_fill_info(msk, &mfi.mptcp_info);
		info_len += min_t(unsigned int, room, sizeof(struct mptcp_info));
	}

	/* tcp_info array: record sizes and destination */
	mfi.size_tcpinfo_kernel = sizeof(struct tcp_info);
	mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user,
				      sizeof(struct tcp_info));
	tcp_dst = u64_to_user_ptr(mfi.tcp_info);

	/* subflow info array: record sizes and destination */
	mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info);
	mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user,
				     sizeof(struct mptcp_subflow_info));
	sf_dst = u64_to_user_ptr(mfi.subflow_info);

	lock_sock(sk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_subflow_info sfinfo;
		struct tcp_info tcp_info;

		/* count every subflow, report only those fitting the
		 * user-provided arrays
		 */
		if (nr_subflows++ >= mfi.size_arrays_user)
			continue;

		/* fetch addr/tcp_info only if the user space buffers
		 * are wide enough
		 */
		memset(&sfinfo, 0, sizeof(sfinfo));
		sfinfo.id = subflow->subflow_id;
		if (mfi.size_sfinfo_user >
		    offsetof(struct mptcp_subflow_info, addrs))
			mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
		if (copy_to_user(sf_dst, &sfinfo, mfi.size_sfinfo_user))
			goto fail_release;
		sf_dst += mfi.size_sfinfo_user;

		if (mfi.size_tcpinfo_user) {
			tcp_get_info(ssk, &tcp_info);
			if (copy_to_user(tcp_dst, &tcp_info,
					 mfi.size_tcpinfo_user))
				goto fail_release;
		}
		tcp_dst += mfi.size_tcpinfo_user;
	}
	release_sock(sk);

	mfi.num_subflows = nr_subflows;

	return mptcp_put_full_info(&mfi, optval, info_len, optlen) ?
	       -EFAULT : 0;

fail_release:
	release_sock(sk);
	return -EFAULT;
}
1381
/* Store the integer option value @val into the user buffer @optval.
 *
 * When the user buffer is shorter than an int (but not empty) and @val
 * fits in a single byte, exactly one byte is stored. Otherwise up to
 * sizeof(int) bytes of @val are copied, truncated to the buffer length.
 * The number of bytes stored is written back to @optlen.
 *
 * Returns 0 on success, -EFAULT when a user-space access fails, -EINVAL
 * when the user-provided length is negative.
 */
static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
				int __user *optlen, int val)
{
	unsigned char byte_val = (unsigned char)val;
	const void *src = &val;
	int ulen;

	if (get_user(ulen, optlen))
		return -EFAULT;
	if (ulen < 0)
		return -EINVAL;

	if (ulen > 0 && ulen < sizeof(int) && val >= 0 && val <= 255) {
		/* short buffer, small value: hand out a single byte */
		src = &byte_val;
		ulen = 1;
	} else {
		ulen = min_t(unsigned int, ulen, sizeof(int));
	}

	if (put_user(ulen, optlen) || copy_to_user(optval, src, ulen))
		return -EFAULT;

	return 0;
}
1410
/* SOL_TCP getsockopt handler for MPTCP sockets.
 *
 * A first group of options is delegated to
 * mptcp_getsockopt_first_sf_only(); the remaining supported ones are
 * answered with an integer taken from the msk itself, with the keepalive
 * options using the per-netns sysctl value when the msk-level field is
 * zero.
 *
 * Returns the result of the selected helper, or -EOPNOTSUPP for any
 * other option.
 */
static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
				    char __user *optval, int __user *optlen)
{
	struct sock *sk = (void *)msk;
	int val;

	switch (optname) {
	case TCP_ULP:
	case TCP_CONGESTION:
	case TCP_INFO:
	case TCP_CC_INFO:
	case TCP_DEFER_ACCEPT:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
	case TCP_MAXSEG:
		return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
						      optval, optlen);
	case TCP_INQ:
		val = msk->recvmsg_inq;
		break;
	case TCP_CORK:
		val = msk->cork;
		break;
	case TCP_NODELAY:
		val = msk->nodelay;
		break;
	case TCP_KEEPIDLE:
		val = msk->keepalive_idle ? :
		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = msk->keepalive_intvl ? :
		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ;
		break;
	case TCP_KEEPCNT:
		val = msk->keepalive_cnt ? :
		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes);
		break;
	case TCP_NOTSENT_LOWAT:
		val = msk->notsent_lowat;
		break;
	case TCP_IS_MPTCP:
		val = 1;
		break;
	default:
		return -EOPNOTSUPP;
	}

	return mptcp_put_int_option(msk, optval, optlen, val);
}
1456
/* SOL_IP getsockopt handler for MPTCP sockets.
 *
 * Every supported option is answered with an integer read from the
 * MPTCP-level socket itself.
 *
 * Returns the result of mptcp_put_int_option(), or -EOPNOTSUPP for any
 * other option.
 */
static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
			       char __user *optval, int __user *optlen)
{
	struct sock *sk = (void *)msk;
	int val;

	switch (optname) {
	case IP_TOS:
		val = READ_ONCE(inet_sk(sk)->tos);
		break;
	case IP_FREEBIND:
		val = inet_test_bit(FREEBIND, sk);
		break;
	case IP_TRANSPARENT:
		val = inet_test_bit(TRANSPARENT, sk);
		break;
	case IP_BIND_ADDRESS_NO_PORT:
		val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
		break;
	case IP_LOCAL_PORT_RANGE:
		val = READ_ONCE(inet_sk(sk)->local_port_range);
		break;
	default:
		return -EOPNOTSUPP;
	}

	return mptcp_put_int_option(msk, optval, optlen, val);
}
1481
/* SOL_IPV6 getsockopt handler for MPTCP sockets.
 *
 * Every supported option is answered with an integer read from the
 * MPTCP-level socket itself.
 *
 * Returns the result of mptcp_put_int_option(), or -EOPNOTSUPP for any
 * other option.
 */
static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname,
			       char __user *optval, int __user *optlen)
{
	struct sock *sk = (void *)msk;
	int val;

	switch (optname) {
	case IPV6_V6ONLY:
		val = sk->sk_ipv6only;
		break;
	case IPV6_TRANSPARENT:
		val = inet_test_bit(TRANSPARENT, sk);
		break;
	case IPV6_FREEBIND:
		val = inet_test_bit(FREEBIND, sk);
		break;
	default:
		return -EOPNOTSUPP;
	}

	return mptcp_put_int_option(msk, optval, optlen, val);
}
1501
/* SOL_MPTCP getsockopt handler: hand the request over to the handler
 * implementing the given option.
 *
 * Returns the handler result, or -EOPNOTSUPP for an unknown option.
 */
static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
				      char __user *optval, int __user *optlen)
{
	switch (optname) {
	case MPTCP_INFO:
		return mptcp_getsockopt_info(msk, optval, optlen);
	case MPTCP_FULL_INFO:
		return mptcp_getsockopt_full_info(msk, optval, optlen);
	case MPTCP_TCPINFO:
		return mptcp_getsockopt_tcpinfo(msk, optval, optlen);
	case MPTCP_SUBFLOW_ADDRS:
		return mptcp_getsockopt_subflow_addrs(msk, optval, optlen);
	default:
		return -EOPNOTSUPP;
	}
}
1518
/* getsockopt() entry point for MPTCP sockets.
 *
 * When the msk has fallen back to plain TCP the request is passed
 * through, unmodified, to the one remaining subflow. Otherwise it is
 * served by the handler matching @level.
 *
 * Returns the handler result, or -EOPNOTSUPP for an unsupported level.
 */
int mptcp_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *fallback_ssk;

	pr_debug("msk=%p\n", msk);

	/* @@ the meaning of socket options when the socket is connected
	 * and there are multiple subflows is not yet defined. It is up to
	 * the MPTCP-level socket to configure the subflows until the
	 * subflow is in TCP fallback, when socket options are passed
	 * through to the one remaining subflow.
	 */
	lock_sock(sk);
	fallback_ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (fallback_ssk)
		return tcp_getsockopt(fallback_ssk, level, optname, optval,
				      option);

	switch (level) {
	case SOL_IP:
		return mptcp_getsockopt_v4(msk, optname, optval, option);
	case SOL_IPV6:
		return mptcp_getsockopt_v6(msk, optname, optval, option);
	case SOL_TCP:
		return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
	case SOL_MPTCP:
		return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option);
	default:
		return -EOPNOTSUPP;
	}
}
1549
/* Copy the socket options recorded on the MPTCP-level socket @msk to the
 * subflow socket @ssk.
 *
 * Covers keepalive, priority, bound device, incoming cpu, ipv6only, TOS,
 * the send/receive buffer sizes locked by user space, linger, mark, the
 * debug flag, congestion control, cork, nodelay, the keepalive timings,
 * maxseg, the TRANSPARENT/FREEBIND/BIND_ADDRESS_NO_PORT inet bits and the
 * local port range.
 *
 * The caller visible in this file, mptcp_sockopt_sync_locked(), checks
 * msk ownership via msk_owned_by_me() before invoking this function.
 */
static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
{
	/* only the buffer size lock bits are propagated to the subflow */
	static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
	struct sock *sk = (struct sock *)msk;
	bool keep_open;

	/* let the subflow protocol react to the keepalive setting, when it
	 * provides the hook, before mirroring the flag itself
	 */
	keep_open = sock_flag(sk, SOCK_KEEPOPEN);
	if (ssk->sk_prot->keepalive)
		ssk->sk_prot->keepalive(ssk, keep_open);
	sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open);

	ssk->sk_priority = sk->sk_priority;
	ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
	ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
	ssk->sk_ipv6only = sk->sk_ipv6only;
	__ip_sock_set_tos(ssk, inet_sk(sk)->tos);

	/* buffer sizes are copied only when user space locked them on the
	 * msk; the lock bits are or-ed in, so bits already set on the
	 * subflow are never cleared here
	 */
	if (sk->sk_userlocks & tx_rx_locks) {
		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
		if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
			WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
			/* keep the value cached in the subflow context
			 * aligned with the new send buffer size
			 */
			mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
		}
		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
			__mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf);
	}

	/* the linger time is copied only when linger is enabled on the msk */
	if (sock_flag(sk, SOCK_LINGER)) {
		ssk->sk_lingertime = sk->sk_lingertime;
		sock_set_flag(ssk, SOCK_LINGER);
	} else {
		sock_reset_flag(ssk, SOCK_LINGER);
	}

	/* a mark change is followed by sk_dst_reset() on the subflow */
	if (sk->sk_mark != ssk->sk_mark) {
		ssk->sk_mark = sk->sk_mark;
		sk_dst_reset(ssk);
	}

	sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG));

	/* switch congestion control, by name, only if the subflow is not
	 * already using the same ops as the msk
	 */
	if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops)
		tcp_set_congestion_control(ssk, msk->ca_name, false, true);
	__tcp_sock_set_cork(ssk, !!msk->cork);
	__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
	tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
	tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
	tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
	tcp_sock_set_maxseg(ssk, msk->maxseg);

	inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
	inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
	inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
	WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range));
}
1605
mptcp_sockopt_sync_locked(struct mptcp_sock * msk,struct sock * ssk)1606 void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
1607 {
1608 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1609
1610 msk_owned_by_me(msk);
1611
1612 ssk->sk_rcvlowat = 0;
1613
1614 /* subflows must ignore any latency-related settings: will not affect
1615 * the user-space - only the msk is relevant - but will foul the
1616 * mptcp scheduler
1617 */
1618 tcp_sk(ssk)->notsent_lowat = UINT_MAX;
1619
1620 if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
1621 sync_socket_options(msk, ssk);
1622
1623 subflow->setsockopt_seq = msk->setsockopt_seq;
1624 }
1625 }
1626
/* Set the receive low-water mark of the MPTCP socket @sk to @val.
 *
 * Unfortunately this is different enough from the tcp version so
 * that we can't factor it out.
 *
 * @val is capped to half of the receive buffer limit: sk->sk_rcvbuf when
 * user space locked the receive buffer size, sysctl_tcp_rmem[2]
 * otherwise. Unless the receive buffer size is locked, the receive
 * buffer of the msk and of all its subflows is grown when the space
 * computed by mptcp_space_from_win() for the new value exceeds the
 * current sk->sk_rcvbuf.
 *
 * Returns 0 on success, -EINVAL when @sk is a plain TCP socket.
 */
int mptcp_set_rcvlowat(struct sock *sk, int val)
{
	struct mptcp_subflow_context *subflow;
	int space, cap;

	/* bpf can land here with a wrong sk type */
	if (sk->sk_protocol == IPPROTO_TCP)
		return -EINVAL;

	/* the low-water mark can't exceed half of the buffer limit */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
	val = min(val, cap);
	/* a zero value is stored as 1 */
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);

	/* Check if we need to signal EPOLLIN right now */
	if (mptcp_epollin_ready(sk))
		sk->sk_data_ready(sk);

	/* a user-locked receive buffer size is never changed here */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	/* nothing to do when the current buffer is already large enough */
	space = mptcp_space_from_win(sk, val);
	if (space <= sk->sk_rcvbuf)
		return 0;

	/* propagate the rcvbuf changes to all the subflows */
	WRITE_ONCE(sk->sk_rcvbuf, space);
	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow;

		/* each subflow is updated under its own socket lock */
		slow = lock_sock_fast(ssk);
		WRITE_ONCE(ssk->sk_rcvbuf, space);
		WRITE_ONCE(tcp_sk(ssk)->window_clamp, val);
		unlock_sock_fast(ssk, slow);
	}
	return 0;
}
1670