1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3 *
4 * Copyright (c) 2021, Red Hat.
5 */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <net/sock.h>
12 #include <net/protocol.h>
13 #include <net/tcp.h>
14 #include <net/mptcp.h>
15 #include "protocol.h"
16
17 #define MIN_INFO_OPTLEN_SIZE 16
18 #define MIN_FULL_INFO_OPTLEN_SIZE 40
19
__mptcp_tcp_fallback(struct mptcp_sock * msk)20 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
21 {
22 msk_owned_by_me(msk);
23
24 if (likely(!__mptcp_check_fallback(msk)))
25 return NULL;
26
27 return msk->first;
28 }
29
sockopt_seq_reset(const struct sock * sk)30 static u32 sockopt_seq_reset(const struct sock *sk)
31 {
32 sock_owned_by_me(sk);
33
34 /* Highbits contain state. Allows to distinguish sockopt_seq
35 * of listener and established:
36 * s0 = new_listener()
37 * sockopt(s0) - seq is 1
38 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0)
39 * sockopt(s0) - seq increments to 2 on s0
40 * sockopt(s1) // seq increments to 2 on s1 (different option)
41 * new ssk completes join, inherits options from s0 // seq 2
42 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
43 *
44 * Set High order bits to sk_state so ssk->seq == msk->seq test
45 * will fail.
46 */
47
48 return (u32)sk->sk_state << 24u;
49 }
50
sockopt_seq_inc(struct mptcp_sock * msk)51 static void sockopt_seq_inc(struct mptcp_sock *msk)
52 {
53 u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff;
54
55 msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq;
56 }
57
/* Fetch an integer option value from @optval into @val.
 *
 * Returns 0 on success, -EINVAL when @optlen is smaller than an int and
 * -EFAULT when the copy fails. @msk is not used by this helper.
 */
static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval,
				unsigned int optlen, int *val)
{
	if (optlen >= sizeof(int))
		return copy_from_sockptr(val, optval, sizeof(*val)) ? -EFAULT : 0;

	return -EINVAL;
}
69
__mptcp_subflow_set_rcvbuf(struct sock * ssk,int val)70 static void __mptcp_subflow_set_rcvbuf(struct sock *ssk, int val)
71 {
72 WRITE_ONCE(ssk->sk_rcvbuf, val);
73 tcp_set_rcvbuf(ssk, val);
74 }
75
mptcp_sol_socket_sync_intval(struct mptcp_sock * msk,int optname,int val)76 static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val)
77 {
78 struct mptcp_subflow_context *subflow;
79 struct sock *sk = (struct sock *)msk;
80
81 lock_sock(sk);
82 sockopt_seq_inc(msk);
83
84 mptcp_for_each_subflow(msk, subflow) {
85 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
86 bool slow = lock_sock_fast(ssk);
87
88 switch (optname) {
89 case SO_DEBUG:
90 sock_valbool_flag(ssk, SOCK_DBG, !!val);
91 break;
92 case SO_KEEPALIVE:
93 if (ssk->sk_prot->keepalive)
94 ssk->sk_prot->keepalive(ssk, !!val);
95 sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
96 break;
97 case SO_PRIORITY:
98 WRITE_ONCE(ssk->sk_priority, val);
99 break;
100 case SO_SNDBUF:
101 case SO_SNDBUFFORCE:
102 ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
103 WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
104 mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
105 break;
106 case SO_RCVBUF:
107 case SO_RCVBUFFORCE:
108 ssk->sk_userlocks |= SOCK_RCVBUF_LOCK;
109 __mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf);
110 break;
111 case SO_MARK:
112 if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
113 WRITE_ONCE(ssk->sk_mark, sk->sk_mark);
114 sk_dst_reset(ssk);
115 }
116 break;
117 case SO_INCOMING_CPU:
118 WRITE_ONCE(ssk->sk_incoming_cpu, val);
119 break;
120 }
121
122 subflow->setsockopt_seq = msk->setsockopt_seq;
123 unlock_sock_fast(ssk, slow);
124 }
125
126 release_sock(sk);
127 }
128
mptcp_sol_socket_intval(struct mptcp_sock * msk,int optname,int val)129 static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val)
130 {
131 sockptr_t optval = KERNEL_SOCKPTR(&val);
132 struct sock *sk = (struct sock *)msk;
133 int ret;
134
135 ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
136 optval, sizeof(val));
137 if (ret)
138 return ret;
139
140 mptcp_sol_socket_sync_intval(msk, optname, val);
141 return 0;
142 }
143
mptcp_so_incoming_cpu(struct mptcp_sock * msk,int val)144 static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val)
145 {
146 struct sock *sk = (struct sock *)msk;
147
148 WRITE_ONCE(sk->sk_incoming_cpu, val);
149
150 mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val);
151 }
152
mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock * msk,int optname,int val)153 static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val)
154 {
155 sockptr_t optval = KERNEL_SOCKPTR(&val);
156 struct mptcp_subflow_context *subflow;
157 struct sock *sk = (struct sock *)msk;
158 int ret;
159
160 ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
161 optval, sizeof(val));
162 if (ret)
163 return ret;
164
165 lock_sock(sk);
166 mptcp_for_each_subflow(msk, subflow) {
167 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
168
169 lock_sock(ssk);
170 sock_set_timestamp(ssk, optname, !!val);
171 release_sock(ssk);
172 }
173
174 release_sock(sk);
175 return 0;
176 }
177
/* Dispatch the SOL_SOCKET options whose argument is a plain integer.
 *
 * Returns -EINVAL / -EFAULT when the integer cannot be fetched,
 * -ENOPROTOOPT for an option not handled here, otherwise the result of
 * the specific handler.
 */
static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
					   sockptr_t optval,
					   unsigned int optlen)
{
	int val;
	int err;

	err = mptcp_get_int_option(msk, optval, optlen, &val);
	if (err)
		return err;

	switch (optname) {
	case SO_INCOMING_CPU:
		mptcp_so_incoming_cpu(msk, val);
		return 0;
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val);
	case SO_KEEPALIVE:
	case SO_DEBUG:
	case SO_MARK:
	case SO_PRIORITY:
	case SO_SNDBUF:
	case SO_SNDBUFFORCE:
	case SO_RCVBUF:
	case SO_RCVBUFFORCE:
		return mptcp_sol_socket_intval(msk, optname, val);
	default:
		return -ENOPROTOOPT;
	}
}
210
/* Handle SO_TIMESTAMPING_OLD / SO_TIMESTAMPING_NEW.
 *
 * Two user layouts are accepted: a full struct so_timestamping, or a
 * bare int carrying only the flags (the remaining fields are then
 * zeroed). Any other @optlen is rejected with -EINVAL.
 *
 * The option is applied to the msk-level socket first; on success the
 * same settings are replicated on every subflow.
 *
 * Returns 0 on success, -EFAULT on copy failure, -EINVAL on a bad
 * length, or the error reported by sock_setsockopt().
 */
static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
						    int optname,
						    sockptr_t optval,
						    unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct so_timestamping timestamping;
	int ret;

	if (optlen == sizeof(timestamping)) {
		if (copy_from_sockptr(&timestamping, optval,
				      sizeof(timestamping)))
			return -EFAULT;
	} else if (optlen == sizeof(int)) {
		/* flags-only form: everything but the flags stays zero */
		memset(&timestamping, 0, sizeof(timestamping));

		if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
			return -EFAULT;
	} else {
		return -EINVAL;
	}

	/* always hand the full structure to the msk-level socket */
	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
			      KERNEL_SOCKPTR(&timestamping),
			      sizeof(timestamping));
	if (ret)
		return ret;

	lock_sock(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		lock_sock(ssk);
		sock_set_timestamping(ssk, optname, timestamping);
		release_sock(ssk);
	}

	release_sock(sk);

	return 0;
}
254
/* Handle SO_LINGER.
 *
 * The option is applied to the msk-level socket first; on success the
 * linger flag (and, when enabled, the linger time stored on the msk) is
 * copied to every subflow, which is also stamped with the new
 * setsockopt sequence number.
 *
 * Returns 0 on success, -EINVAL when @optlen is too short, -EFAULT on
 * copy failure, or the error reported by sock_setsockopt().
 */
static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
					      unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct linger ling;
	int err;

	if (optlen < sizeof(ling))
		return -EINVAL;

	if (copy_from_sockptr(&ling, optval, sizeof(ling)))
		return -EFAULT;

	err = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER,
			      KERNEL_SOCKPTR(&ling), sizeof(ling));
	if (err)
		return err;

	lock_sock(sk);
	sockopt_seq_inc(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow;

		slow = lock_sock_fast(ssk);

		if (ling.l_onoff) {
			/* use the linger time as stored on the msk */
			ssk->sk_lingertime = sk->sk_lingertime;
			sock_set_flag(ssk, SOCK_LINGER);
		} else {
			sock_reset_flag(ssk, SOCK_LINGER);
		}

		subflow->setsockopt_seq = msk->setsockopt_seq;
		unlock_sock_fast(ssk, slow);
	}
	release_sock(sk);

	return 0;
}
295
/* Entry point for all SOL_SOCKET level options on an MPTCP socket.
 *
 * Options fall in five groups:
 *  - bind/reuse related ones are set on the first subflow and the
 *    resulting value is mirrored back on the msk;
 *  - integer, timestamping and linger options go to dedicated handlers
 *    that also update the subflows;
 *  - options only relevant for the msk are passed to sock_setsockopt();
 *  - a set of options is accepted and silently ignored (returns 0);
 *  - anything else is refused with -EOPNOTSUPP.
 */
static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *first;
	int err;

	switch (optname) {
	case SO_REUSEPORT:
	case SO_REUSEADDR:
	case SO_BINDTODEVICE:
	case SO_BINDTOIFINDEX:
		lock_sock(sk);
		first = __mptcp_nmpc_sk(msk);
		if (IS_ERR(first)) {
			release_sock(sk);
			return PTR_ERR(first);
		}

		err = sk_setsockopt(first, SOL_SOCKET, optname, optval, optlen);
		if (!err) {
			/* mirror the value accepted by the subflow on the msk */
			switch (optname) {
			case SO_REUSEPORT:
				sk->sk_reuseport = first->sk_reuseport;
				break;
			case SO_REUSEADDR:
				sk->sk_reuse = first->sk_reuse;
				break;
			case SO_BINDTODEVICE:
			case SO_BINDTOIFINDEX:
				sk->sk_bound_dev_if = first->sk_bound_dev_if;
				break;
			}
		}
		release_sock(sk);
		return err;

	case SO_KEEPALIVE:
	case SO_PRIORITY:
	case SO_SNDBUF:
	case SO_SNDBUFFORCE:
	case SO_RCVBUF:
	case SO_RCVBUFFORCE:
	case SO_MARK:
	case SO_INCOMING_CPU:
	case SO_DEBUG:
	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		return mptcp_setsockopt_sol_socket_int(msk, optname, optval,
						       optlen);

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		return mptcp_setsockopt_sol_socket_timestamping(msk, optname,
								optval, optlen);

	case SO_LINGER:
		return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen);

	case SO_RCVLOWAT:
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
	case SO_BUSY_POLL:
	case SO_PREFER_BUSY_POLL:
	case SO_BUSY_POLL_BUDGET:
		/* No need to copy: only relevant for msk */
		return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
				       optval, optlen);

	case SO_NO_CHECK:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_BSDCOMPAT:
	case SO_PASSCRED:
	case SO_PASSPIDFD:
	case SO_PASSSEC:
	case SO_RXQ_OVFL:
	case SO_WIFI_STATUS:
	case SO_NOFCS:
	case SO_SELECT_ERR_QUEUE:
		/* accepted, but nothing is done */
		return 0;

	default:
		/* Unsupported options:
		 *
		 * SO_OOBINLINE is not supported, let's avoid the related mess.
		 * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF,
		 * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER:
		 * we must be careful with subflows.
		 *
		 * SO_ATTACH_REUSEPORT_EBPF is not supported, as it explicitly
		 * checks the sk_protocol field.
		 *
		 * SO_PEEK_OFF is unsupported, as it is for plain TCP.
		 * SO_MAX_PACING_RATE is unsupported, we must be careful with
		 * subflows.
		 * SO_CNX_ADVICE is currently unsupported, could possibly be
		 * relevant, but likely needs careful design.
		 *
		 * SO_ZEROCOPY is currently unsupported, TODO in sndmsg.
		 * SO_TXTIME is currently unsupported.
		 */
		return -EOPNOTSUPP;
	}
}
392
/* Handle the SOL_IPV6 options that MPTCP applies itself: IPV6_V6ONLY,
 * IPV6_TRANSPARENT and IPV6_FREEBIND.
 *
 * The option is set on the first subflow through tcp_setsockopt(); on
 * success the msk sequence number is bumped and the resulting setting is
 * mirrored on the msk-level socket. Every other option name yields
 * -EOPNOTSUPP.
 */
static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int ret;

	if (optname != IPV6_V6ONLY &&
	    optname != IPV6_TRANSPARENT &&
	    optname != IPV6_FREEBIND)
		return -EOPNOTSUPP;

	lock_sock(sk);

	ssk = __mptcp_nmpc_sk(msk);
	if (IS_ERR(ssk)) {
		ret = PTR_ERR(ssk);
		goto unlock;
	}

	ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
	if (ret != 0)
		goto unlock;

	sockopt_seq_inc(msk);

	/* copy back on the msk what the subflow ended up with */
	if (optname == IPV6_V6ONLY)
		sk->sk_ipv6only = ssk->sk_ipv6only;
	else if (optname == IPV6_TRANSPARENT)
		inet_assign_bit(TRANSPARENT, sk,
				inet_test_bit(TRANSPARENT, ssk));
	else
		inet_assign_bit(FREEBIND, sk,
				inet_test_bit(FREEBIND, ssk));

unlock:
	release_sock(sk);
	return ret;
}
439
mptcp_supported_sockopt(int level,int optname)440 static bool mptcp_supported_sockopt(int level, int optname)
441 {
442 if (level == SOL_IP) {
443 switch (optname) {
444 /* should work fine */
445 case IP_FREEBIND:
446 case IP_TRANSPARENT:
447 case IP_BIND_ADDRESS_NO_PORT:
448 case IP_LOCAL_PORT_RANGE:
449
450 /* the following are control cmsg related */
451 case IP_PKTINFO:
452 case IP_RECVTTL:
453 case IP_RECVTOS:
454 case IP_RECVOPTS:
455 case IP_RETOPTS:
456 case IP_PASSSEC:
457 case IP_RECVORIGDSTADDR:
458 case IP_CHECKSUM:
459 case IP_RECVFRAGSIZE:
460
461 /* common stuff that need some love */
462 case IP_TOS:
463 case IP_TTL:
464 case IP_MTU_DISCOVER:
465 case IP_RECVERR:
466
467 /* possibly less common may deserve some love */
468 case IP_MINTTL:
469
470 /* the following is apparently a no-op for plain TCP */
471 case IP_RECVERR_RFC4884:
472 return true;
473 }
474
475 /* IP_OPTIONS is not supported, needs subflow care */
476 /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */
477 /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF,
478 * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP,
479 * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE,
480 * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP,
481 * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE,
482 * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal
483 * with mcast stuff
484 */
485 /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */
486 return false;
487 }
488 if (level == SOL_IPV6) {
489 switch (optname) {
490 case IPV6_V6ONLY:
491
492 /* the following are control cmsg related */
493 case IPV6_RECVPKTINFO:
494 case IPV6_2292PKTINFO:
495 case IPV6_RECVHOPLIMIT:
496 case IPV6_2292HOPLIMIT:
497 case IPV6_RECVRTHDR:
498 case IPV6_2292RTHDR:
499 case IPV6_RECVHOPOPTS:
500 case IPV6_2292HOPOPTS:
501 case IPV6_RECVDSTOPTS:
502 case IPV6_2292DSTOPTS:
503 case IPV6_RECVTCLASS:
504 case IPV6_FLOWINFO:
505 case IPV6_RECVPATHMTU:
506 case IPV6_RECVORIGDSTADDR:
507 case IPV6_RECVFRAGSIZE:
508
509 /* the following ones need some love but are quite common */
510 case IPV6_TCLASS:
511 case IPV6_TRANSPARENT:
512 case IPV6_FREEBIND:
513 case IPV6_PKTINFO:
514 case IPV6_2292PKTOPTIONS:
515 case IPV6_UNICAST_HOPS:
516 case IPV6_MTU_DISCOVER:
517 case IPV6_MTU:
518 case IPV6_RECVERR:
519 case IPV6_FLOWINFO_SEND:
520 case IPV6_FLOWLABEL_MGR:
521 case IPV6_MINHOPCOUNT:
522 case IPV6_DONTFRAG:
523 case IPV6_AUTOFLOWLABEL:
524
525 /* the following one is a no-op for plain TCP */
526 case IPV6_RECVERR_RFC4884:
527 return true;
528 }
529
530 /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are
531 * not supported
532 */
533 /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF,
534 * IPV6_MULTICAST_IF, IPV6_ADDRFORM,
535 * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST,
536 * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP,
537 * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP,
538 * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER
539 * are not supported better not deal with mcast
540 */
541 /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */
542
543 /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */
544 /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */
545 return false;
546 }
547 if (level == SOL_TCP) {
548 switch (optname) {
549 /* the following are no-op or should work just fine */
550 case TCP_THIN_DUPACK:
551 case TCP_DEFER_ACCEPT:
552
553 /* the following need some love */
554 case TCP_MAXSEG:
555 case TCP_NODELAY:
556 case TCP_THIN_LINEAR_TIMEOUTS:
557 case TCP_CONGESTION:
558 case TCP_CORK:
559 case TCP_KEEPIDLE:
560 case TCP_KEEPINTVL:
561 case TCP_KEEPCNT:
562 case TCP_SYNCNT:
563 case TCP_SAVE_SYN:
564 case TCP_LINGER2:
565 case TCP_WINDOW_CLAMP:
566 case TCP_QUICKACK:
567 case TCP_USER_TIMEOUT:
568 case TCP_TIMESTAMP:
569 case TCP_NOTSENT_LOWAT:
570 case TCP_TX_DELAY:
571 case TCP_INQ:
572 case TCP_FASTOPEN:
573 case TCP_FASTOPEN_CONNECT:
574 case TCP_FASTOPEN_KEY:
575 case TCP_FASTOPEN_NO_COOKIE:
576 return true;
577 }
578
579 /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */
580
581 /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
582 * TCP_REPAIR_WINDOW are not supported, better avoid this mess
583 */
584 }
585 return false;
586 }
587
/* Handle TCP_CONGESTION.
 *
 * The algorithm name is copied from the caller (at most
 * TCP_CA_NAME_MAX - 1 bytes, always NUL terminated) and applied to every
 * subflow. Each subflow is attempted regardless of earlier failures; the
 * first error encountered is the one reported. The name is recorded in
 * msk->ca_name only when no subflow reported an error.
 *
 * Returns 0 on success, -EINVAL for an empty option, -EFAULT on copy
 * failure, or the first tcp_set_congestion_control() error.
 */
static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval,
					       unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	char ca_name[TCP_CA_NAME_MAX];
	bool net_admin;
	int first_err = 0;
	int copied;

	if (optlen < 1)
		return -EINVAL;

	copied = strncpy_from_sockptr(ca_name, optval,
				      min_t(long, TCP_CA_NAME_MAX - 1, optlen));
	if (copied < 0)
		return -EFAULT;

	ca_name[copied] = 0;

	net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN);

	lock_sock(sk);
	sockopt_seq_inc(msk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int err;

		lock_sock(ssk);
		err = tcp_set_congestion_control(ssk, ca_name, true, net_admin);
		/* remember only the first failure, but keep going */
		if (err < 0 && !first_err)
			first_err = err;
		subflow->setsockopt_seq = msk->setsockopt_seq;
		release_sock(ssk);
	}

	if (!first_err)
		strscpy(msk->ca_name, ca_name, sizeof(msk->ca_name));

	release_sock(sk);
	return first_err;
}
630
__mptcp_setsockopt_set_val(struct mptcp_sock * msk,int max,int (* set_val)(struct sock *,int),int * msk_val,int val)631 static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
632 int (*set_val)(struct sock *, int),
633 int *msk_val, int val)
634 {
635 struct mptcp_subflow_context *subflow;
636 int err = 0;
637
638 mptcp_for_each_subflow(msk, subflow) {
639 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
640 int ret;
641
642 lock_sock(ssk);
643 ret = set_val(ssk, val);
644 err = err ? : ret;
645 release_sock(ssk);
646 }
647
648 if (!err) {
649 *msk_val = val;
650 sockopt_seq_inc(msk);
651 }
652
653 return err;
654 }
655
__mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock * msk,int val)656 static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
657 {
658 struct mptcp_subflow_context *subflow;
659 struct sock *sk = (struct sock *)msk;
660
661 sockopt_seq_inc(msk);
662 msk->cork = !!val;
663 mptcp_for_each_subflow(msk, subflow) {
664 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
665
666 lock_sock(ssk);
667 __tcp_sock_set_cork(ssk, !!val);
668 release_sock(ssk);
669 }
670 if (!val)
671 mptcp_check_and_set_pending(sk);
672
673 return 0;
674 }
675
__mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock * msk,int val)676 static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
677 {
678 struct mptcp_subflow_context *subflow;
679 struct sock *sk = (struct sock *)msk;
680
681 sockopt_seq_inc(msk);
682 msk->nodelay = !!val;
683 mptcp_for_each_subflow(msk, subflow) {
684 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
685
686 lock_sock(ssk);
687 __tcp_sock_set_nodelay(ssk, !!val);
688 release_sock(ssk);
689 }
690 if (val)
691 mptcp_check_and_set_pending(sk);
692 return 0;
693 }
694
/* Handle the SOL_IP options that are set on the msk-level socket and then
 * copied to the first subflow: IP_FREEBIND, IP_TRANSPARENT,
 * IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE.
 *
 * Returns 0 on success, the ip_setsockopt() error, the error from
 * __mptcp_nmpc_sk(), or -EOPNOTSUPP (with a one-time warning) when
 * called with an option it does not know about.
 */
static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int err;

	err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
	if (err != 0)
		return err;

	lock_sock(sk);

	ssk = __mptcp_nmpc_sk(msk);
	if (IS_ERR(ssk)) {
		err = PTR_ERR(ssk);
		goto unlock;
	}

	/* copy to the subflow whatever the msk ended up with */
	switch (optname) {
	case IP_FREEBIND:
		inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
		break;
	case IP_TRANSPARENT:
		inet_assign_bit(TRANSPARENT, ssk,
				inet_test_bit(TRANSPARENT, sk));
		break;
	case IP_BIND_ADDRESS_NO_PORT:
		inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk,
				inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
		break;
	case IP_LOCAL_PORT_RANGE:
		WRITE_ONCE(inet_sk(ssk)->local_port_range,
			   READ_ONCE(inet_sk(sk)->local_port_range));
		break;
	default:
		/* callers are expected to filter the option name */
		release_sock(sk);
		WARN_ON_ONCE(1);
		return -EOPNOTSUPP;
	}

	sockopt_seq_inc(msk);

unlock:
	release_sock(sk);
	return err;
}
740
/* Handle IP_TOS: set it on the msk-level socket, then apply the TOS value
 * read back from the msk to every subflow.
 *
 * Returns 0 on success or the ip_setsockopt() error.
 */
static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
				       sockptr_t optval, unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	int tos;
	int err;

	err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
	if (err != 0)
		return err;

	lock_sock(sk);
	sockopt_seq_inc(msk);

	/* use the value as stored by ip_setsockopt() on the msk */
	tos = READ_ONCE(inet_sk(sk)->tos);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool slow = lock_sock_fast(ssk);

		__ip_sock_set_tos(ssk, tos);
		unlock_sock_fast(ssk, slow);
	}

	release_sock(sk);

	return 0;
}
768
/* Dispatch the SOL_IP options that MPTCP handles itself; every other
 * option name yields -EOPNOTSUPP.
 */
static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
			       sockptr_t optval, unsigned int optlen)
{
	switch (optname) {
	case IP_TOS:
		return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen);
	case IP_FREEBIND:
	case IP_TRANSPARENT:
	case IP_BIND_ADDRESS_NO_PORT:
	case IP_LOCAL_PORT_RANGE:
		return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen);
	default:
		return -EOPNOTSUPP;
	}
}
784
/* Apply a socket option to the first subflow only, i.e. the one returned
 * by __mptcp_nmpc_sk(), under the msk socket lock.
 *
 * Returns the tcp_setsockopt() result, or the error from
 * __mptcp_nmpc_sk() when that subflow is not available.
 */
static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
					  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int ret;

	/* Limit to first subflow, before the connection establishment */
	lock_sock(sk);

	ssk = __mptcp_nmpc_sk(msk);
	if (IS_ERR(ssk))
		ret = PTR_ERR(ssk);
	else
		ret = tcp_setsockopt(ssk, level, optname, optval, optlen);

	release_sock(sk);
	return ret;
}
806
/* Apply a socket option to every subflow of @msk via tcp_setsockopt().
 *
 * The walk stops at the first failing subflow and that error is
 * returned; subflows already updated are left as they are. The msk
 * setsockopt sequence number is bumped only when no subflow failed.
 * No lock is taken here.
 */
static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level,
				   int optname, sockptr_t optval,
				   unsigned int optlen)
{
	struct mptcp_subflow_context *subflow;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		int err;

		err = tcp_setsockopt(ssk, level, optname, optval, optlen);
		if (err)
			return err;
	}

	sockopt_seq_inc(msk);
	return 0;
}
827
/* Entry point for the SOL_TCP level options.
 *
 * A first group of options is dispatched before any integer parsing
 * (TCP_ULP is refused, TCP_CONGESTION and the first-subflow-only options
 * have dedicated handlers). The remaining ones take an integer argument
 * and are processed under the msk socket lock.
 *
 * Returns 0 on success, a negative error otherwise; integer options not
 * handled here yield -ENOPROTOOPT.
 */
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
				    sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = (struct sock *)msk;
	int err, val;

	switch (optname) {
	case TCP_ULP:
		return -EOPNOTSUPP;
	case TCP_CONGESTION:
		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
	case TCP_DEFER_ACCEPT:
		/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
		mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
		return 0;
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname,
						      optval, optlen);
	}

	err = mptcp_get_int_option(msk, optval, optlen, &val);
	if (err)
		return err;

	lock_sock(sk);

	switch (optname) {
	case TCP_INQ:
		/* strictly boolean: only 0 and 1 are accepted */
		if (val == 0 || val == 1)
			msk->recvmsg_inq = !!val;
		else
			err = -EINVAL;
		break;
	case TCP_NOTSENT_LOWAT:
		WRITE_ONCE(msk->notsent_lowat, val);
		mptcp_write_space(sk);
		break;
	case TCP_CORK:
		err = __mptcp_setsockopt_sol_tcp_cork(msk, val);
		break;
	case TCP_NODELAY:
		err = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
		break;
	case TCP_KEEPIDLE:
		err = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
						 &tcp_sock_set_keepidle_locked,
						 &msk->keepalive_idle, val);
		break;
	case TCP_KEEPINTVL:
		err = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
						 &tcp_sock_set_keepintvl,
						 &msk->keepalive_intvl, val);
		break;
	case TCP_KEEPCNT:
		err = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
						 &tcp_sock_set_keepcnt,
						 &msk->keepalive_cnt, val);
		break;
	case TCP_MAXSEG:
		/* the msk-level value is stored before the subflows are updated */
		msk->maxseg = val;
		err = mptcp_setsockopt_all_sf(msk, SOL_TCP, optname, optval,
					      optlen);
		break;
	default:
		err = -ENOPROTOOPT;
		break;
	}

	release_sock(sk);
	return err;
}
901
/* setsockopt() implementation for MPTCP sockets.
 *
 * SOL_SOCKET options have their own handler. For the other levels the
 * option must be on the allow-list (else -ENOPROTOOPT); if the msk has
 * fallen back to TCP the option is passed straight to the remaining
 * subflow, otherwise it is dispatched to the per-level MPTCP handler.
 * A level with no handler yields -EOPNOTSUPP.
 */
int mptcp_setsockopt(struct sock *sk, int level, int optname,
		     sockptr_t optval, unsigned int optlen)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *fallback_ssk;

	pr_debug("msk=%p\n", msk);

	if (level == SOL_SOCKET)
		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);

	if (!mptcp_supported_sockopt(level, optname))
		return -ENOPROTOOPT;

	/* The meaning of setsockopt() when the socket is connected and
	 * there are multiple subflows is not yet defined. It is up to the
	 * MPTCP-level socket to configure the subflows until the subflow
	 * is in TCP fallback, when TCP socket options are passed through
	 * to the one remaining subflow.
	 */
	lock_sock(sk);
	fallback_ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (fallback_ssk)
		return tcp_setsockopt(fallback_ssk, level, optname, optval,
				      optlen);

	switch (level) {
	case SOL_IP:
		return mptcp_setsockopt_v4(msk, optname, optval, optlen);
	case SOL_IPV6:
		return mptcp_setsockopt_v6(msk, optname, optval, optlen);
	case SOL_TCP:
		return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen);
	default:
		return -EOPNOTSUPP;
	}
}
939
/* Read a socket option from the first subflow, under the msk socket lock.
 *
 * msk->first is used when set; otherwise the subflow is obtained from
 * __mptcp_nmpc_sk(), whose error (if any) is returned to the caller.
 */
static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
					  char __user *optval, int __user *optlen)
{
	struct sock *sk = (struct sock *)msk;
	struct sock *ssk;
	int ret;

	lock_sock(sk);

	ssk = msk->first;
	if (!ssk) {
		ssk = __mptcp_nmpc_sk(msk);
		if (IS_ERR(ssk)) {
			ret = PTR_ERR(ssk);
			goto out;
		}
	}

	ret = tcp_getsockopt(ssk, level, optname, optval, optlen);

out:
	release_sock(sk);
	return ret;
}
965
mptcp_diag_fill_info(struct mptcp_sock * msk,struct mptcp_info * info)966 void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
967 {
968 struct sock *sk = (struct sock *)msk;
969 u32 flags = 0;
970 bool slow;
971 u32 now;
972
973 memset(info, 0, sizeof(*info));
974
975 info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows);
976 info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
977 info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
978 info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
979
980 if (inet_sk_state_load(sk) == TCP_LISTEN)
981 return;
982
983 /* The following limits only make sense for the in-kernel PM */
984 if (mptcp_pm_is_kernel(msk)) {
985 info->mptcpi_limit_extra_subflows =
986 mptcp_pm_get_limit_extra_subflows(msk);
987 info->mptcpi_endp_signal_max =
988 mptcp_pm_get_endp_signal_max(msk);
989 info->mptcpi_limit_add_addr_accepted =
990 mptcp_pm_get_limit_add_addr_accepted(msk);
991 info->mptcpi_endp_subflow_max =
992 mptcp_pm_get_endp_subflow_max(msk);
993 info->mptcpi_endp_laminar_max =
994 mptcp_pm_get_endp_laminar_max(msk);
995 info->mptcpi_endp_fullmesh_max =
996 mptcp_pm_get_endp_fullmesh_max(msk);
997 }
998
999 if (__mptcp_check_fallback(msk))
1000 flags |= MPTCP_INFO_FLAG_FALLBACK;
1001 if (READ_ONCE(msk->can_ack))
1002 flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
1003 info->mptcpi_flags = flags;
1004
1005 slow = lock_sock_fast(sk);
1006 info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
1007 info->mptcpi_token = msk->token;
1008 info->mptcpi_write_seq = msk->write_seq;
1009 info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
1010 info->mptcpi_bytes_sent = msk->bytes_sent;
1011 info->mptcpi_bytes_received = msk->bytes_received;
1012 info->mptcpi_bytes_retrans = msk->bytes_retrans;
1013 info->mptcpi_subflows_total = info->mptcpi_extra_subflows +
1014 __mptcp_has_initial_subflow(msk);
1015 now = tcp_jiffies32;
1016 info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
1017 info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv);
1018 unlock_sock_fast(sk, slow);
1019
1020 mptcp_data_lock(sk);
1021 info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv);
1022 info->mptcpi_snd_una = msk->snd_una;
1023 info->mptcpi_rcv_nxt = msk->ack_seq;
1024 info->mptcpi_bytes_acked = msk->bytes_acked;
1025 mptcp_data_unlock(sk);
1026 }
1027 EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
1028
/* getsockopt() handler returning a struct mptcp_info snapshot.
 *
 * The caller-supplied length selects how much of the structure is copied
 * out; it is clamped to sizeof(struct mptcp_info) and the effective
 * length is written back to @optlen.
 *
 * Returns 0 on success (including the zero-length probe), -EINVAL for a
 * negative length, -EFAULT when user memory cannot be accessed.
 */
static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen)
{
	struct mptcp_info m_info;
	int len;

	if (get_user(len, optlen))
		return -EFAULT;

	/* A negative length would be converted to a huge unsigned value by
	 * the min_t() below and silently treated as "whole structure";
	 * reject it, as mptcp_get_subflow_data() does for its own length.
	 */
	if (len < 0)
		return -EINVAL;

	/* When used only to check if a fallback to TCP happened. */
	if (len == 0)
		return 0;

	len = min_t(unsigned int, len, sizeof(struct mptcp_info));

	mptcp_diag_fill_info(msk, &m_info);

	if (put_user(len, optlen))
		return -EFAULT;

	if (copy_to_user(optval, &m_info, len))
		return -EFAULT;

	return 0;
}
1053
/* Write the subflow-data header @sfd back to user space and update the
 * length stored at @optlen.
 *
 * When @copied is non-zero the reported length is @copied plus
 * sfd->size_subflow_data; otherwise it is the number of header bytes
 * copied out, i.e. the smaller of sfd->size_subflow_data and
 * sizeof(*sfd).
 *
 * Returns 0 on success, -EFAULT when user memory cannot be written.
 */
static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
				  char __user *optval,
				  u32 copied,
				  int __user *optlen)
{
	u32 hdr_len = min_t(u32, sfd->size_subflow_data, sizeof(*sfd));
	u32 total = copied ? copied + sfd->size_subflow_data : hdr_len;

	if (put_user(total, optlen))
		return -EFAULT;

	if (copy_to_user(optval, sfd, hdr_len))
		return -EFAULT;

	return 0;
}
1074
/* Read and validate the subflow-data header supplied by user space.
 *
 * On success @sfd holds the header and the return value is the number of
 * bytes of the user buffer that follow it (user length minus
 * sfd->size_subflow_data).
 *
 * Returns -EFAULT when user memory cannot be read and -EINVAL when the
 * buffer is shorter than the header, when a size field is out of range,
 * or when the kernel-owned fields (num_subflows, size_kernel) are not
 * zero on input.
 */
static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
				  char __user *optval,
				  int __user *optlen)
{
	int user_len, hdr_len;

	/* if mptcp_subflow_data size is changed, need to adjust
	 * this function to deal with programs using old version.
	 */
	BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE);

	if (get_user(user_len, optlen))
		return -EFAULT;

	if (user_len < MIN_INFO_OPTLEN_SIZE)
		return -EINVAL;

	memset(sfd, 0, sizeof(*sfd));

	hdr_len = min_t(unsigned int, user_len, sizeof(*sfd));
	if (copy_from_user(sfd, optval, hdr_len))
		return -EFAULT;

	/* size_subflow_data is u32, but len is signed */
	if (sfd->size_subflow_data > INT_MAX || sfd->size_user > INT_MAX)
		return -EINVAL;

	/* the declared header must fit between the minimum and the buffer */
	if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE)
		return -EINVAL;
	if (sfd->size_subflow_data > user_len)
		return -EINVAL;

	/* these fields are outputs only */
	if (sfd->num_subflows || sfd->size_kernel)
		return -EINVAL;

	return user_len - sfd->size_subflow_data;
}
1112
/* MPTCP_TCPINFO handler: dump one struct tcp_info per subflow.
 *
 * The user buffer starts with a struct mptcp_subflow_data header, followed
 * at offset size_subflow_data by an array of entries. Each entry is
 * truncated to size_user bytes, itself capped to sizeof(struct tcp_info).
 * Every subflow is counted in num_subflows, including the ones that do not
 * fit in the remaining space.
 *
 * Returns 0 on success, -EFAULT on a failed user-space access, or the
 * error reported by mptcp_get_subflow_data().
 */
static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
				    int __user *optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	unsigned int sfcount = 0, copied = 0;
	struct mptcp_subflow_data sfd;
	char __user *dst;
	int room;

	room = mptcp_get_subflow_data(&sfd, optval, optlen);
	if (room < 0)
		return room;

	sfd.size_kernel = sizeof(struct tcp_info);
	sfd.size_user = min_t(unsigned int, sfd.size_user,
			      sizeof(struct tcp_info));

	dst = optval + sfd.size_subflow_data;

	lock_sock(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct tcp_info info;

		++sfcount;

		/* not enough space left for another entry: only count */
		if (!room || room < sfd.size_user)
			continue;

		tcp_get_info(ssk, &info);

		if (copy_to_user(dst, &info, sfd.size_user))
			goto fault_unlock;

		dst += sfd.size_user;
		copied += sfd.size_user;
		room -= sfd.size_user;
	}

	release_sock(sk);

	sfd.num_subflows = sfcount;

	return mptcp_put_subflow_data(&sfd, optval, copied, optlen);

fault_unlock:
	release_sock(sk);
	return -EFAULT;
}
1165
mptcp_get_sub_addrs(const struct sock * sk,struct mptcp_subflow_addrs * a)1166 static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
1167 {
1168 const struct inet_sock *inet = inet_sk(sk);
1169
1170 memset(a, 0, sizeof(*a));
1171
1172 if (sk->sk_family == AF_INET) {
1173 a->sin_local.sin_family = AF_INET;
1174 a->sin_local.sin_port = inet->inet_sport;
1175 a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr;
1176
1177 if (!a->sin_local.sin_addr.s_addr)
1178 a->sin_local.sin_addr.s_addr = inet->inet_saddr;
1179
1180 a->sin_remote.sin_family = AF_INET;
1181 a->sin_remote.sin_port = inet->inet_dport;
1182 a->sin_remote.sin_addr.s_addr = inet->inet_daddr;
1183 #if IS_ENABLED(CONFIG_IPV6)
1184 } else if (sk->sk_family == AF_INET6) {
1185 const struct ipv6_pinfo *np = inet6_sk(sk);
1186
1187 if (WARN_ON_ONCE(!np))
1188 return;
1189
1190 a->sin6_local.sin6_family = AF_INET6;
1191 a->sin6_local.sin6_port = inet->inet_sport;
1192
1193 if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
1194 a->sin6_local.sin6_addr = np->saddr;
1195 else
1196 a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr;
1197
1198 a->sin6_remote.sin6_family = AF_INET6;
1199 a->sin6_remote.sin6_port = inet->inet_dport;
1200 a->sin6_remote.sin6_addr = sk->sk_v6_daddr;
1201 #endif
1202 }
1203 }
1204
/* MPTCP_SUBFLOW_ADDRS handler: dump one struct mptcp_subflow_addrs per
 * subflow.
 *
 * Same buffer layout as the MPTCP_TCPINFO handler: a struct
 * mptcp_subflow_data header, then at offset size_subflow_data an array of
 * entries, each truncated to size_user bytes. num_subflows reports every
 * subflow, including those that did not fit in the user buffer.
 *
 * Returns 0 on success, -EFAULT on a failed user-space access, or the
 * error reported by mptcp_get_subflow_data().
 */
static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval,
					  int __user *optlen)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	unsigned int sfcount = 0, copied = 0;
	struct mptcp_subflow_data sfd;
	char __user *dst;
	int room;

	room = mptcp_get_subflow_data(&sfd, optval, optlen);
	if (room < 0)
		return room;

	sfd.size_kernel = sizeof(struct mptcp_subflow_addrs);
	sfd.size_user = min_t(unsigned int, sfd.size_user,
			      sizeof(struct mptcp_subflow_addrs));

	dst = optval + sfd.size_subflow_data;

	lock_sock(sk);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_subflow_addrs addrs;

		++sfcount;

		/* the user buffer is exhausted: keep counting subflows only */
		if (!room || room < sfd.size_user)
			continue;

		mptcp_get_sub_addrs(ssk, &addrs);

		if (copy_to_user(dst, &addrs, sfd.size_user))
			goto fault_unlock;

		dst += sfd.size_user;
		copied += sfd.size_user;
		room -= sfd.size_user;
	}

	release_sock(sk);

	sfd.num_subflows = sfcount;

	return mptcp_put_subflow_data(&sfd, optval, copied, optlen);

fault_unlock:
	release_sock(sk);
	return -EFAULT;
}
1257
/* Fetch and validate the fixed-size head of struct mptcp_full_info, i.e.
 * the MIN_FULL_INFO_OPTLEN_SIZE bytes preceding the embedded mptcp_info.
 *
 * @mfi:    kernel storage; zeroed, then its head is filled from @optval
 * @optval: user buffer holding the structure
 * @optlen: user pointer to the buffer length
 *
 * -EINVAL is returned if the buffer cannot hold the fixed head, if any of
 * the kernel-filled fields (size_tcpinfo_kernel, size_sfinfo_kernel,
 * num_subflows) is not zero, or if a user-provided size exceeds INT_MAX.
 *
 * Returns the number of bytes available past the fixed head, -EFAULT on a
 * failed user-space access, or -EINVAL as described above.
 */
static int mptcp_get_full_info(struct mptcp_full_info *mfi,
			       char __user *optval,
			       int __user *optlen)
{
	int optsz;

	BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) !=
		     MIN_FULL_INFO_OPTLEN_SIZE);

	if (get_user(optsz, optlen))
		return -EFAULT;

	if (optsz < MIN_FULL_INFO_OPTLEN_SIZE)
		return -EINVAL;

	memset(mfi, 0, sizeof(*mfi));
	if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE))
		return -EFAULT;

	if (mfi->size_tcpinfo_kernel || mfi->size_sfinfo_kernel ||
	    mfi->num_subflows ||
	    mfi->size_sfinfo_user > INT_MAX ||
	    mfi->size_tcpinfo_user > INT_MAX)
		return -EINVAL;

	return optsz - MIN_FULL_INFO_OPTLEN_SIZE;
}
1288
/* Copy struct mptcp_full_info back to user space and update @optlen.
 *
 * @copylen is the number of bytes to return on top of the fixed
 * MIN_FULL_INFO_OPTLEN_SIZE head; the sum is both stored in @optlen and
 * used as the size of the copy.
 *
 * Returns 0 on success or -EFAULT if a user-space access fails.
 */
static int mptcp_put_full_info(struct mptcp_full_info *mfi,
			       char __user *optval,
			       u32 copylen,
			       int __user *optlen)
{
	u32 total = copylen + MIN_FULL_INFO_OPTLEN_SIZE;

	if (put_user(total, optlen) ||
	    copy_to_user(optval, mfi, total))
		return -EFAULT;

	return 0;
}
1302
/* MPTCP_FULL_INFO handler.
 *
 * Returns the msk-level mptcp_info inside the main structure and, through
 * the two user pointers carried by the request (subflow_info, tcp_info),
 * one struct mptcp_subflow_info and one struct tcp_info per subflow. At
 * most size_arrays_user subflows are dumped, while num_subflows always
 * reports how many subflows were walked. Each entry is truncated to the
 * user-declared entry size, capped to the kernel structure size.
 *
 * Returns 0 on success, -EFAULT on a failed user-space access, or the
 * error reported by mptcp_get_full_info().
 */
static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval,
				      int __user *optlen)
{
	unsigned int sfcount = 0, copylen = 0;
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	void __user *tcp_dst, *sf_dst;
	struct mptcp_full_info mfi;
	int err = -EFAULT;
	int room;

	room = mptcp_get_full_info(&mfi, optval, optlen);
	if (room < 0)
		return room;

	/* don't bother filling the mptcp info if there is not enough
	 * user-space-provided storage
	 */
	if (room > 0) {
		mptcp_diag_fill_info(msk, &mfi.mptcp_info);
		copylen = min_t(unsigned int, room, sizeof(struct mptcp_info));
	}

	mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info);
	mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user,
				     sizeof(struct mptcp_subflow_info));
	mfi.size_tcpinfo_kernel = sizeof(struct tcp_info);
	mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user,
				      sizeof(struct tcp_info));

	sf_dst = u64_to_user_ptr(mfi.subflow_info);
	tcp_dst = u64_to_user_ptr(mfi.tcp_info);

	lock_sock(sk);
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_subflow_info sfinfo;
		struct tcp_info tinfo;

		/* past the user-provided array size: count, don't dump */
		if (sfcount++ >= mfi.size_arrays_user)
			continue;

		/* fetch addr/tcp_info only if the user space buffers
		 * are wide enough
		 */
		memset(&sfinfo, 0, sizeof(sfinfo));
		sfinfo.id = subflow->subflow_id;
		if (mfi.size_sfinfo_user >
		    offsetof(struct mptcp_subflow_info, addrs))
			mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
		if (copy_to_user(sf_dst, &sfinfo, mfi.size_sfinfo_user))
			goto unlock;

		if (mfi.size_tcpinfo_user) {
			tcp_get_info(ssk, &tinfo);
			if (copy_to_user(tcp_dst, &tinfo,
					 mfi.size_tcpinfo_user))
				goto unlock;
		}

		tcp_dst += mfi.size_tcpinfo_user;
		sf_dst += mfi.size_sfinfo_user;
	}
	err = 0;

unlock:
	release_sock(sk);
	if (err)
		return err;

	mfi.num_subflows = sfcount;
	if (mptcp_put_full_info(&mfi, optval, copylen, optlen))
		return -EFAULT;

	return 0;
}
1376
/* Return the integer option value @val to user space.
 *
 * If the user buffer is shorter than an int but not empty, and @val fits
 * in one byte (0..255), a single byte is returned and @optlen is set to 1.
 * In every other case the first min(user length, sizeof(int)) bytes of
 * @val are copied and @optlen is set to that length.
 *
 * Returns 0 on success, -EINVAL for a negative user length, or -EFAULT if
 * a user-space access fails.
 */
static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
				int __user *optlen, int val)
{
	unsigned char byte;
	const void *src;
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	if (len > 0 && len < sizeof(int) && val >= 0 && val <= 255) {
		/* short buffer, small value: hand back one byte */
		byte = (unsigned char)val;
		src = &byte;
		len = 1;
	} else {
		src = &val;
		len = min_t(unsigned int, len, sizeof(int));
	}

	if (put_user(len, optlen) ||
	    copy_to_user(optval, src, len))
		return -EFAULT;

	return 0;
}
1405
/* getsockopt() handler for the SOL_TCP options supported on an msk.
 *
 * A first group of options is delegated to
 * mptcp_getsockopt_first_sf_only(); the remaining ones are answered with
 * an integer taken from the msk itself. For the keepalive options a zero
 * msk value falls back to the per-netns sysctl default.
 *
 * Returns the result of the selected helper, or -EOPNOTSUPP for any
 * option not listed here.
 */
static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
				    char __user *optval, int __user *optlen)
{
	struct sock *sk = (void *)msk;
	int val;

	switch (optname) {
	case TCP_ULP:
	case TCP_CONGESTION:
	case TCP_INFO:
	case TCP_CC_INFO:
	case TCP_DEFER_ACCEPT:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
	case TCP_MAXSEG:
		return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
						      optval, optlen);
	case TCP_INQ:
		val = msk->recvmsg_inq;
		break;
	case TCP_CORK:
		val = msk->cork;
		break;
	case TCP_NODELAY:
		val = msk->nodelay;
		break;
	case TCP_KEEPIDLE:
		val = msk->keepalive_idle ? :
		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = msk->keepalive_intvl ? :
		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ;
		break;
	case TCP_KEEPCNT:
		val = msk->keepalive_cnt ? :
		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes);
		break;
	case TCP_NOTSENT_LOWAT:
		val = msk->notsent_lowat;
		break;
	case TCP_IS_MPTCP:
		val = 1;
		break;
	default:
		return -EOPNOTSUPP;
	}

	return mptcp_put_int_option(msk, optval, optlen, val);
}
1451
/* getsockopt() handler for the SOL_IP options supported on an msk: the
 * value is read from the msk inet socket and returned as an integer.
 *
 * Returns the result of mptcp_put_int_option(), or -EOPNOTSUPP for any
 * option not listed here.
 */
static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
			       char __user *optval, int __user *optlen)
{
	struct sock *sk = (void *)msk;
	int val;

	switch (optname) {
	case IP_TOS:
		val = READ_ONCE(inet_sk(sk)->tos);
		break;
	case IP_FREEBIND:
		val = inet_test_bit(FREEBIND, sk);
		break;
	case IP_TRANSPARENT:
		val = inet_test_bit(TRANSPARENT, sk);
		break;
	case IP_BIND_ADDRESS_NO_PORT:
		val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
		break;
	case IP_LOCAL_PORT_RANGE:
		val = READ_ONCE(inet_sk(sk)->local_port_range);
		break;
	default:
		return -EOPNOTSUPP;
	}

	return mptcp_put_int_option(msk, optval, optlen, val);
}
1476
/* getsockopt() handler for the SOL_IPV6 options supported on an msk: the
 * value is read from the msk socket and returned as an integer.
 *
 * Returns the result of mptcp_put_int_option(), or -EOPNOTSUPP for any
 * option not listed here.
 */
static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname,
			       char __user *optval, int __user *optlen)
{
	struct sock *sk = (void *)msk;
	int val;

	switch (optname) {
	case IPV6_V6ONLY:
		val = sk->sk_ipv6only;
		break;
	case IPV6_TRANSPARENT:
		val = inet_test_bit(TRANSPARENT, sk);
		break;
	case IPV6_FREEBIND:
		val = inet_test_bit(FREEBIND, sk);
		break;
	default:
		return -EOPNOTSUPP;
	}

	return mptcp_put_int_option(msk, optval, optlen, val);
}
1496
/* getsockopt() dispatcher for the SOL_MPTCP level.
 *
 * Returns the result of the handler matching @optname, or -EOPNOTSUPP
 * when the option is unknown.
 */
static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
				      char __user *optval, int __user *optlen)
{
	int ret = -EOPNOTSUPP;

	switch (optname) {
	case MPTCP_INFO:
		ret = mptcp_getsockopt_info(msk, optval, optlen);
		break;
	case MPTCP_FULL_INFO:
		ret = mptcp_getsockopt_full_info(msk, optval, optlen);
		break;
	case MPTCP_TCPINFO:
		ret = mptcp_getsockopt_tcpinfo(msk, optval, optlen);
		break;
	case MPTCP_SUBFLOW_ADDRS:
		ret = mptcp_getsockopt_subflow_addrs(msk, optval, optlen);
		break;
	}

	return ret;
}
1513
/* getsockopt() entry point for MPTCP sockets.
 *
 * If the msk has fallen back to plain TCP the request is forwarded to
 * tcp_getsockopt() on the first subflow; otherwise it is dispatched to the
 * per-level handler.
 *
 * Returns the handler result, or -EOPNOTSUPP for an unsupported level.
 */
int mptcp_getsockopt(struct sock *sk, int level, int optname,
		     char __user *optval, int __user *option)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct sock *ssk;

	pr_debug("msk=%p\n", msk);

	/* @@ the meaning of socket options when the socket is connected
	 * and there are multiple subflows is not yet defined. The
	 * MPTCP-level socket answers on its own, unless the connection is
	 * in TCP fallback: then the request is passed through to the one
	 * remaining subflow.
	 */
	lock_sock(sk);
	ssk = __mptcp_tcp_fallback(msk);
	release_sock(sk);
	if (ssk)
		return tcp_getsockopt(ssk, level, optname, optval, option);

	switch (level) {
	case SOL_IP:
		return mptcp_getsockopt_v4(msk, optname, optval, option);
	case SOL_IPV6:
		return mptcp_getsockopt_v6(msk, optname, optval, option);
	case SOL_TCP:
		return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
	case SOL_MPTCP:
		return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option);
	default:
		return -EOPNOTSUPP;
	}
}
1544
/* Copy the socket settings recorded on the msk to the subflow socket @ssk.
 *
 * Called by mptcp_sockopt_sync_locked() when the subflow setsockopt_seq
 * does not match the msk one. Locking requirements are not visible here;
 * NOTE(review): presumably both the msk and the ssk must be owned by the
 * caller - confirm against the call sites.
 */
static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
{
	/* only these two sk_userlocks bits are mirrored to the subflow */
	static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
	struct sock *sk = (struct sock *)msk;
	bool keep_open;

	/* keepalive: invoke the protocol hook, if any, then mirror the flag */
	keep_open = sock_flag(sk, SOCK_KEEPOPEN);
	if (ssk->sk_prot->keepalive)
		ssk->sk_prot->keepalive(ssk, keep_open);
	sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open);

	/* plain per-socket attributes, copied verbatim */
	ssk->sk_priority = sk->sk_priority;
	ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
	ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
	ssk->sk_ipv6only = sk->sk_ipv6only;
	__ip_sock_set_tos(ssk, inet_sk(sk)->tos);

	/* buffer sizes are propagated only if locked on the msk; the
	 * subflow lock bits are only ever added here, never cleared
	 */
	if (sk->sk_userlocks & tx_rx_locks) {
		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
		if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
			WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
			mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
		}
		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
			__mptcp_subflow_set_rcvbuf(ssk, sk->sk_rcvbuf);
	}

	/* linger: the time is copied only when the flag is set on the msk */
	if (sock_flag(sk, SOCK_LINGER)) {
		ssk->sk_lingertime = sk->sk_lingertime;
		sock_set_flag(ssk, SOCK_LINGER);
	} else {
		sock_reset_flag(ssk, SOCK_LINGER);
	}

	/* NOTE(review): sk_dst_reset() presumably drops the cached route so
	 * that the new mark is taken into account - confirm
	 */
	if (sk->sk_mark != ssk->sk_mark) {
		ssk->sk_mark = sk->sk_mark;
		sk_dst_reset(ssk);
	}

	sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG));

	/* TCP-level settings; the congestion control is changed only when
	 * the ops in use differ between msk and subflow
	 */
	if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops)
		tcp_set_congestion_control(ssk, msk->ca_name, false, true);
	__tcp_sock_set_cork(ssk, !!msk->cork);
	__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
	tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
	tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
	tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
	tcp_sock_set_maxseg(ssk, msk->maxseg);

	/* inet-level flags and the local port range */
	inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
	inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
	inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
	WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range));
}
1600
mptcp_sockopt_sync_locked(struct mptcp_sock * msk,struct sock * ssk)1601 void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
1602 {
1603 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1604
1605 msk_owned_by_me(msk);
1606
1607 ssk->sk_rcvlowat = 0;
1608
1609 /* subflows must ignore any latency-related settings: will not affect
1610 * the user-space - only the msk is relevant - but will foul the
1611 * mptcp scheduler
1612 */
1613 tcp_sk(ssk)->notsent_lowat = UINT_MAX;
1614
1615 if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
1616 sync_socket_options(msk, ssk);
1617
1618 subflow->setsockopt_seq = msk->setsockopt_seq;
1619 }
1620 }
1621
1622 /* unfortunately this is different enough from the tcp version so
1623 * that we can't factor it out
1624 */
mptcp_set_rcvlowat(struct sock * sk,int val)1625 int mptcp_set_rcvlowat(struct sock *sk, int val)
1626 {
1627 struct mptcp_subflow_context *subflow;
1628 int space, cap;
1629
1630 /* bpf can land here with a wrong sk type */
1631 if (sk->sk_protocol == IPPROTO_TCP)
1632 return -EINVAL;
1633
1634 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1635 cap = sk->sk_rcvbuf >> 1;
1636 else
1637 cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
1638 val = min(val, cap);
1639 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1640
1641 /* Check if we need to signal EPOLLIN right now */
1642 if (mptcp_epollin_ready(sk))
1643 sk->sk_data_ready(sk);
1644
1645 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1646 return 0;
1647
1648 space = mptcp_space_from_win(sk, val);
1649 if (space <= sk->sk_rcvbuf)
1650 return 0;
1651
1652 /* propagate the rcvbuf changes to all the subflows */
1653 WRITE_ONCE(sk->sk_rcvbuf, space);
1654 mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
1655 struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1656 bool slow;
1657
1658 slow = lock_sock_fast(ssk);
1659 WRITE_ONCE(ssk->sk_rcvbuf, space);
1660 WRITE_ONCE(tcp_sk(ssk)->window_clamp, val);
1661 unlock_sock_fast(ssk, slow);
1662 }
1663 return 0;
1664 }
1665