xref: /linux/net/mptcp/ctrl.c (revision b693b51e0829b96a5c43f45c3fba3d11f6f09d2f)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2019, Tessares SA.
5  */
6 
7 #ifdef CONFIG_SYSCTL
8 #include <linux/sysctl.h>
9 #endif
10 
11 #include <net/net_namespace.h>
12 #include <net/netns/generic.h>
13 
14 #include "protocol.h"
15 #include "mib.h"
16 
/* sysctl path under which mptcp_sysctl_table is registered for each netns */
#define MPTCP_SYSCTL_PATH "net/mptcp"

/* Slot id handed to the pernet core through mptcp_pernet_ops.id and used with
 * net_generic() to locate this file's per-netns data.
 */
static int mptcp_pernet_id;

#ifdef CONFIG_SYSCTL
/* Upper bound (.extra2) enforced on the "pm_type" sysctl entry */
static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
#endif
24 
/* Per network namespace MPTCP settings and active-blackhole detection state.
 * The pernet core allocates one instance per netns (see mptcp_pernet_ops.size);
 * it is retrieved with mptcp_get_pernet().
 */
struct mptcp_pernet {
#ifdef CONFIG_SYSCTL
	/* header returned when registering this netns' sysctl table */
	struct ctl_table_header *ctl_table_hdr;
#endif

	/* "add_addr_timeout" sysctl, converted by proc_dointvec_jiffies */
	unsigned int add_addr_timeout;
	/* "blackhole_timeout" sysctl, multiplied by HZ when used; 0 turns the
	 * active blackhole detection off
	 */
	unsigned int blackhole_timeout;
	/* "close_timeout" sysctl, converted by proc_dointvec_jiffies */
	unsigned int close_timeout;
	/* "stale_loss_cnt" sysctl */
	unsigned int stale_loss_cnt;
	/* how many times active MPTCP got disabled since the last reset */
	atomic_t active_disable_times;
	/* jiffies recorded by the last mptcp_active_disable() call */
	unsigned long active_disable_stamp;
	/* "syn_retrans_before_tcp_fallback" sysctl, compared against the
	 * subflow's icsk_retransmits
	 */
	u8 syn_retrans_before_tcp_fallback;
	/* "enabled" sysctl, 0 or 1 */
	u8 mptcp_enabled;
	/* "checksum_enabled" sysctl, 0 or 1 */
	u8 checksum_enabled;
	/* "allow_join_initial_addr_port" sysctl, 0 or 1 */
	u8 allow_join_initial_addr_port;
	/* "pm_type" sysctl; also updated when "path_manager" is written */
	u8 pm_type;
	/* "add_addr_v6_port_drop_ts" sysctl, 0 or 1 */
	u8 add_addr_v6_port_drop_ts;
	/* "scheduler" sysctl: name of the selected packet scheduler */
	char scheduler[MPTCP_SCHED_NAME_MAX];
	/* "path_manager" sysctl: name of the selected path manager */
	char path_manager[MPTCP_PM_NAME_MAX];
};
45 
46 static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
47 {
48 	return net_generic(net, mptcp_pernet_id);
49 }
50 
51 int mptcp_is_enabled(const struct net *net)
52 {
53 	return mptcp_get_pernet(net)->mptcp_enabled;
54 }
55 
56 unsigned int mptcp_get_add_addr_timeout(const struct net *net)
57 {
58 	return mptcp_get_pernet(net)->add_addr_timeout;
59 }
60 
61 int mptcp_is_checksum_enabled(const struct net *net)
62 {
63 	return mptcp_get_pernet(net)->checksum_enabled;
64 }
65 
66 int mptcp_allow_join_id0(const struct net *net)
67 {
68 	return mptcp_get_pernet(net)->allow_join_initial_addr_port;
69 }
70 
71 unsigned int mptcp_stale_loss_cnt(const struct net *net)
72 {
73 	return mptcp_get_pernet(net)->stale_loss_cnt;
74 }
75 
76 unsigned int mptcp_close_timeout(const struct sock *sk)
77 {
78 	if (sock_flag(sk, SOCK_DEAD))
79 		return TCP_TIMEWAIT_LEN;
80 	return mptcp_get_pernet(sock_net(sk))->close_timeout;
81 }
82 
83 int mptcp_get_pm_type(const struct net *net)
84 {
85 	return mptcp_get_pernet(net)->pm_type;
86 }
87 
88 const char *mptcp_get_path_manager(const struct net *net)
89 {
90 	return mptcp_get_pernet(net)->path_manager;
91 }
92 
93 const char *mptcp_get_scheduler(const struct net *net)
94 {
95 	return mptcp_get_pernet(net)->scheduler;
96 }
97 
98 unsigned int mptcp_add_addr_v6_port_drop_ts(const struct net *net)
99 {
100 	return READ_ONCE(mptcp_get_pernet(net)->add_addr_v6_port_drop_ts);
101 }
102 
103 static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
104 {
105 	pernet->mptcp_enabled = 1;
106 	pernet->add_addr_timeout = TCP_RTO_MAX;
107 	pernet->blackhole_timeout = 3600;
108 	pernet->syn_retrans_before_tcp_fallback = 2;
109 	atomic_set(&pernet->active_disable_times, 0);
110 	pernet->close_timeout = TCP_TIMEWAIT_LEN;
111 	pernet->checksum_enabled = 0;
112 	pernet->allow_join_initial_addr_port = 1;
113 	pernet->stale_loss_cnt = 4;
114 	pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
115 	strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler));
116 	strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager));
117 	pernet->add_addr_v6_port_drop_ts = 1;
118 }
119 
120 #ifdef CONFIG_SYSCTL
121 static int mptcp_set_scheduler(char *scheduler, const char *name)
122 {
123 	struct mptcp_sched_ops *sched;
124 	int ret = 0;
125 
126 	rcu_read_lock();
127 	sched = mptcp_sched_find(name);
128 	if (sched)
129 		strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX);
130 	else
131 		ret = -ENOENT;
132 	rcu_read_unlock();
133 
134 	return ret;
135 }
136 
137 static int proc_scheduler(const struct ctl_table *ctl, int write,
138 			  void *buffer, size_t *lenp, loff_t *ppos)
139 {
140 	char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data;
141 	char val[MPTCP_SCHED_NAME_MAX];
142 	struct ctl_table tbl = {
143 		.data = val,
144 		.maxlen = MPTCP_SCHED_NAME_MAX,
145 	};
146 	int ret;
147 
148 	strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX);
149 
150 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
151 	if (write && ret == 0)
152 		ret = mptcp_set_scheduler(*scheduler, val);
153 
154 	return ret;
155 }
156 
157 static int proc_available_schedulers(const struct ctl_table *ctl,
158 				     int write, void *buffer,
159 				     size_t *lenp, loff_t *ppos)
160 {
161 	struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, };
162 	int ret;
163 
164 	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
165 	if (!tbl.data)
166 		return -ENOMEM;
167 
168 	mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX);
169 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
170 	kfree(tbl.data);
171 
172 	return ret;
173 }
174 
175 static int proc_blackhole_detect_timeout(const struct ctl_table *table,
176 					 int write, void *buffer, size_t *lenp,
177 					 loff_t *ppos)
178 {
179 	struct mptcp_pernet *pernet = container_of(table->data,
180 						   struct mptcp_pernet,
181 						   blackhole_timeout);
182 	int ret;
183 
184 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
185 	if (write && ret == 0)
186 		atomic_set(&pernet->active_disable_times, 0);
187 
188 	return ret;
189 }
190 
191 static int mptcp_set_path_manager(char *path_manager, const char *name)
192 {
193 	struct mptcp_pm_ops *pm_ops;
194 	int ret = 0;
195 
196 	rcu_read_lock();
197 	pm_ops = mptcp_pm_find(name);
198 	if (pm_ops)
199 		strscpy(path_manager, name, MPTCP_PM_NAME_MAX);
200 	else
201 		ret = -ENOENT;
202 	rcu_read_unlock();
203 
204 	return ret;
205 }
206 
207 static int proc_path_manager(const struct ctl_table *ctl, int write,
208 			     void *buffer, size_t *lenp, loff_t *ppos)
209 {
210 	struct mptcp_pernet *pernet = container_of(ctl->data,
211 						   struct mptcp_pernet,
212 						   path_manager);
213 	char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data;
214 	char pm_name[MPTCP_PM_NAME_MAX];
215 	const struct ctl_table tbl = {
216 		.data = pm_name,
217 		.maxlen = MPTCP_PM_NAME_MAX,
218 	};
219 	int ret;
220 
221 	strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX);
222 
223 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
224 	if (write && ret == 0) {
225 		ret = mptcp_set_path_manager(*path_manager, pm_name);
226 		if (ret == 0) {
227 			u8 pm_type = __MPTCP_PM_TYPE_NR;
228 
229 			if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0)
230 				pm_type = MPTCP_PM_TYPE_KERNEL;
231 			else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0)
232 				pm_type = MPTCP_PM_TYPE_USERSPACE;
233 			pernet->pm_type = pm_type;
234 		}
235 	}
236 
237 	return ret;
238 }
239 
240 static int proc_pm_type(const struct ctl_table *ctl, int write,
241 			void *buffer, size_t *lenp, loff_t *ppos)
242 {
243 	struct mptcp_pernet *pernet = container_of(ctl->data,
244 						   struct mptcp_pernet,
245 						   pm_type);
246 	int ret;
247 
248 	ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos);
249 	if (write && ret == 0) {
250 		u8 pm_type = READ_ONCE(*(u8 *)ctl->data);
251 		char *pm_name = "";
252 
253 		if (pm_type == MPTCP_PM_TYPE_KERNEL)
254 			pm_name = "kernel";
255 		else if (pm_type == MPTCP_PM_TYPE_USERSPACE)
256 			pm_name = "userspace";
257 		mptcp_set_path_manager(pernet->path_manager, pm_name);
258 	}
259 
260 	return ret;
261 }
262 
263 static int proc_available_path_managers(const struct ctl_table *ctl,
264 					int write, void *buffer,
265 					size_t *lenp, loff_t *ppos)
266 {
267 	struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, };
268 	int ret;
269 
270 	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
271 	if (!tbl.data)
272 		return -ENOMEM;
273 
274 	mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX);
275 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
276 	kfree(tbl.data);
277 
278 	return ret;
279 }
280 
/* Template of the MPTCP sysctl entries.
 *
 * The .data pointers are filled per netns by mptcp_pernet_new_table(), which
 * addresses the entries BY INDEX: keep the order below and the indexes used
 * there in sync when adding, removing or moving an entry.
 */
static struct ctl_table mptcp_sysctl_table[] = {
	/* [0] -> pernet->mptcp_enabled */
	{
		.procname = "enabled",
		.maxlen = sizeof(u8),
		.mode = 0644,
		/* users with CAP_NET_ADMIN or root (not and) can change this
		 * value, same as the other sysctls of the 'net' tree.
		 */
		.proc_handler = proc_dou8vec_minmax,
		.extra1       = SYSCTL_ZERO,
		.extra2       = SYSCTL_ONE
	},
	/* [1] -> pernet->add_addr_timeout */
	{
		.procname = "add_addr_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	/* [2] -> pernet->checksum_enabled */
	{
		.procname = "checksum_enabled",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
		.extra1       = SYSCTL_ZERO,
		.extra2       = SYSCTL_ONE
	},
	/* [3] -> pernet->allow_join_initial_addr_port */
	{
		.procname = "allow_join_initial_addr_port",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
		.extra1       = SYSCTL_ZERO,
		.extra2       = SYSCTL_ONE
	},
	/* [4] -> pernet->stale_loss_cnt */
	{
		.procname = "stale_loss_cnt",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_douintvec_minmax,
	},
	/* [5] -> pernet->pm_type; the handler also updates path_manager */
	{
		.procname = "pm_type",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_pm_type,
		.extra1       = SYSCTL_ZERO,
		.extra2       = &mptcp_pm_type_max
	},
	/* [6] -> pernet->scheduler */
	{
		.procname = "scheduler",
		.maxlen	= MPTCP_SCHED_NAME_MAX,
		.mode = 0644,
		.proc_handler = proc_scheduler,
	},
	/* [7] no .data: the handler builds the list on each access */
	{
		.procname = "available_schedulers",
		.maxlen	= MPTCP_SCHED_BUF_MAX,
		.mode = 0444,
		.proc_handler = proc_available_schedulers,
	},
	/* [8] -> pernet->close_timeout */
	{
		.procname = "close_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	/* [9] -> pernet->blackhole_timeout; a write also clears the
	 * blackhole counter
	 */
	{
		.procname = "blackhole_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_blackhole_detect_timeout,
		.extra1 = SYSCTL_ZERO,
	},
	/* [10] -> pernet->syn_retrans_before_tcp_fallback */
	{
		.procname = "syn_retrans_before_tcp_fallback",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	/* [11] -> pernet->path_manager; the handler also updates pm_type */
	{
		.procname = "path_manager",
		.maxlen	= MPTCP_PM_NAME_MAX,
		.mode = 0644,
		.proc_handler = proc_path_manager,
	},
	/* [12] no .data: the handler builds the list on each access */
	{
		.procname = "available_path_managers",
		.maxlen	= MPTCP_PM_BUF_MAX,
		.mode = 0444,
		.proc_handler = proc_available_path_managers,
	},
	/* [13] -> pernet->add_addr_v6_port_drop_ts */
	{
		.procname = "add_addr_v6_port_drop_ts",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
		.extra1       = SYSCTL_ZERO,
		.extra2       = SYSCTL_ONE
	},
};
381 
382 static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
383 {
384 	struct ctl_table_header *hdr;
385 	struct ctl_table *table;
386 
387 	table = mptcp_sysctl_table;
388 	if (!net_eq(net, &init_net)) {
389 		table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL);
390 		if (!table)
391 			goto err_alloc;
392 	}
393 
394 	table[0].data = &pernet->mptcp_enabled;
395 	table[1].data = &pernet->add_addr_timeout;
396 	table[2].data = &pernet->checksum_enabled;
397 	table[3].data = &pernet->allow_join_initial_addr_port;
398 	table[4].data = &pernet->stale_loss_cnt;
399 	table[5].data = &pernet->pm_type;
400 	table[6].data = &pernet->scheduler;
401 	/* table[7] is for available_schedulers which is read-only info */
402 	table[8].data = &pernet->close_timeout;
403 	table[9].data = &pernet->blackhole_timeout;
404 	table[10].data = &pernet->syn_retrans_before_tcp_fallback;
405 	table[11].data = &pernet->path_manager;
406 	/* table[12] is for available_path_managers which is read-only info */
407 	table[13].data = &pernet->add_addr_v6_port_drop_ts;
408 
409 	hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
410 				     ARRAY_SIZE(mptcp_sysctl_table));
411 	if (!hdr)
412 		goto err_reg;
413 
414 	pernet->ctl_table_hdr = hdr;
415 
416 	return 0;
417 
418 err_reg:
419 	if (!net_eq(net, &init_net))
420 		kfree(table);
421 err_alloc:
422 	return -ENOMEM;
423 }
424 
425 static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
426 {
427 	const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
428 
429 	unregister_net_sysctl_table(pernet->ctl_table_hdr);
430 
431 	kfree(table);
432 }
433 
434 #else
435 
/* !CONFIG_SYSCTL variant: there is no table to register, always succeed. */
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
{
	return 0;
}
440 
/* !CONFIG_SYSCTL variant: nothing was registered, nothing to undo. */
static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}
442 
443 #endif /* CONFIG_SYSCTL */
444 
/* The following code block deals with middlebox issues affecting MPTCP,
 * similar to what is done for TFO.
 * The proposed solution is to disable active MPTCP globally when SYN+MPC are
 * dropped, while SYN without MPC are not. In this case, active side MPTCP is
 * disabled globally for 1h at first. Then, if it happens again, it is disabled
 * for 2h, then 4h, 8h, ...
 * The timeout is reset back to 1h when a successful active MPTCP connection is
 * fully established.
 */
454 
/* Disable active MPTCP and record current jiffies and active_disable_times.
 *
 * @sk: socket whose netns gets active MPTCP disabled
 *
 * Nothing is done when the blackhole_timeout sysctl of that netns is 0.
 * Otherwise the disable timestamp is refreshed, the disable counter is
 * incremented and the MPTCP_MIB_BLACKHOLE counter is bumped.
 */
void mptcp_active_disable(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct mptcp_pernet *pernet;

	pernet = mptcp_get_pernet(net);

	/* a zero timeout means the blackhole detection is switched off */
	if (!READ_ONCE(pernet->blackhole_timeout))
		return;

	/* Paired with READ_ONCE() in mptcp_active_should_disable() */
	WRITE_ONCE(pernet->active_disable_stamp, jiffies);

	/* Paired with smp_rmb() in mptcp_active_should_disable().
	 * We want pernet->active_disable_stamp to be updated first.
	 */
	smp_mb__before_atomic();
	atomic_inc(&pernet->active_disable_times);

	MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE);
}
477 
478 /* Calculate timeout for MPTCP active disable
479  * Return true if we are still in the active MPTCP disable period
480  * Return false if timeout already expired and we should use active MPTCP
481  */
482 bool mptcp_active_should_disable(struct sock *ssk)
483 {
484 	struct net *net = sock_net(ssk);
485 	unsigned int blackhole_timeout;
486 	struct mptcp_pernet *pernet;
487 	unsigned long timeout;
488 	int disable_times;
489 	int multiplier;
490 
491 	pernet = mptcp_get_pernet(net);
492 	blackhole_timeout = READ_ONCE(pernet->blackhole_timeout);
493 
494 	if (!blackhole_timeout)
495 		return false;
496 
497 	disable_times = atomic_read(&pernet->active_disable_times);
498 	if (!disable_times)
499 		return false;
500 
501 	/* Paired with smp_mb__before_atomic() in mptcp_active_disable() */
502 	smp_rmb();
503 
504 	/* Limit timeout to max: 2^6 * initial timeout */
505 	multiplier = 1 << min(disable_times - 1, 6);
506 
507 	/* Paired with the WRITE_ONCE() in mptcp_active_disable(). */
508 	timeout = READ_ONCE(pernet->active_disable_stamp) +
509 		  multiplier * blackhole_timeout * HZ;
510 
511 	return time_before(jiffies, timeout);
512 }
513 
514 /* Enable active MPTCP and reset active_disable_times if needed */
515 void mptcp_active_enable(struct sock *sk)
516 {
517 	struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk));
518 
519 	if (atomic_read(&pernet->active_disable_times)) {
520 		struct net_device *dev;
521 		struct dst_entry *dst;
522 
523 		rcu_read_lock();
524 		dst = __sk_dst_get(sk);
525 		dev = dst ? dst_dev_rcu(dst) : NULL;
526 		if (!(dev && (dev->flags & IFF_LOOPBACK)))
527 			atomic_set(&pernet->active_disable_times, 0);
528 		rcu_read_unlock();
529 	}
530 }
531 
532 /* Check the number of retransmissions, and fallback to TCP if needed */
533 void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
534 {
535 	struct mptcp_subflow_context *subflow;
536 	u8 timeouts, to_max;
537 	struct net *net;
538 
539 	/* Only check MPTCP SYN ... */
540 	if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT))
541 		return;
542 
543 	subflow = mptcp_subflow_ctx(ssk);
544 
545 	/* ... + MP_CAPABLE */
546 	if (!subflow->request_mptcp) {
547 		/* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */
548 		subflow->mpc_drop = 0;
549 		return;
550 	}
551 
552 	net = sock_net(ssk);
553 	timeouts = inet_csk(ssk)->icsk_retransmits;
554 	to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback;
555 
556 	if (timeouts == to_max || (timeouts < to_max && expired)) {
557 		subflow->mpc_drop = 1;
558 		mptcp_early_fallback(mptcp_sk(subflow->conn), subflow,
559 				     MPTCP_MIB_MPCAPABLEACTIVEDROP);
560 	}
561 }
562 
563 static int __net_init mptcp_net_init(struct net *net)
564 {
565 	struct mptcp_pernet *pernet = mptcp_get_pernet(net);
566 
567 	mptcp_pernet_set_defaults(pernet);
568 
569 	return mptcp_pernet_new_table(net, pernet);
570 }
571 
572 /* Note: the callback will only be called per extra netns */
573 static void __net_exit mptcp_net_exit(struct net *net)
574 {
575 	struct mptcp_pernet *pernet = mptcp_get_pernet(net);
576 
577 	mptcp_pernet_del_table(pernet);
578 }
579 
/* Per-netns hooks registered from mptcp_init(): .size asks for one
 * struct mptcp_pernet per netns, and .id points at the slot id that
 * mptcp_get_pernet() passes to net_generic().
 */
static struct pernet_operations mptcp_pernet_ops = {
	.init = mptcp_net_init,
	.exit = mptcp_net_exit,
	.id = &mptcp_pernet_id,
	.size = sizeof(struct mptcp_pernet),
};
586 
587 void __init mptcp_init(void)
588 {
589 	mptcp_join_cookie_init();
590 	mptcp_proto_init();
591 
592 	if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
593 		panic("Failed to register MPTCP pernet subsystem.\n");
594 }
595 
596 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
597 int __init mptcpv6_init(void)
598 {
599 	int err;
600 
601 	err = mptcp_proto_v6_init();
602 
603 	return err;
604 }
605 #endif
606