xref: /linux/net/core/sysctl_net_core.c (revision 3da35aa8af345abf6cd180cfc0538c753b927a18)
1 // SPDX-License-Identifier: GPL-2.0
2 /* -*- linux-c -*-
3  * sysctl_net_core.c: sysctl interface to net core subsystem.
4  *
5  * Begun April 1, 1996, Mike Shaver.
6  * Added /proc/sys/net/core directory entry (empty =) ). [MS]
7  */
8 
9 #include <linux/filter.h>
10 #include <linux/mm.h>
11 #include <linux/sysctl.h>
12 #include <linux/module.h>
13 #include <linux/socket.h>
14 #include <linux/netdevice.h>
15 #include <linux/ratelimit.h>
16 #include <linux/vmalloc.h>
17 #include <linux/init.h>
18 #include <linux/slab.h>
19 #include <linux/sched/isolation.h>
20 #include <linux/hex.h>
21 
22 #include <net/ip.h>
23 #include <net/sock.h>
24 #include <net/net_ratelimit.h>
25 #include <net/busy_poll.h>
26 #include <net/pkt_sched.h>
27 #include <net/hotdata.h>
28 #include <net/proto_memory.h>
29 #include <net/rps.h>
30 
31 #include "dev.h"
32 #include "net-sysfs.h"
33 
/* Constant bounds wired into .extra1/.extra2 of the sysctl tables below. */
static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
/* Lower bound for netdev_budget_usecs: two jiffies expressed in usecs. */
static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ;

static int net_msg_warn;	/* Unused, but still a sysctl */

/* 0 - fallback tunnels in every netns; 1 - init_net only; 2 - nowhere.
 * See fb_tunnels_only_for_init_net_sysctl_setup() for the boot parameter.
 */
int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);

/* 0 - Keep current behavior:
 *     IPv4: inherit all current settings from init_net
 *     IPv6: reset all settings to default
 * 1 - Both inherit all current settings from init_net
 * 2 - Both reset all settings to default
 * 3 - Both inherit all settings from current netns
 */
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
55 
56 #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
57 static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
58 			struct cpumask *mask)
59 {
60 	char *kbuf;
61 	int len;
62 
63 	if (*ppos || !*lenp) {
64 		*lenp = 0;
65 		return 0;
66 	}
67 
68 	/* CPUs are displayed as a hex bitmap + a comma between each groups of 8
69 	 * nibbles (except the last one which has a newline instead).
70 	 * Guesstimate the buffer size at the group granularity level.
71 	 */
72 	len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp);
73 	kbuf = kmalloc(len, GFP_KERNEL);
74 	if (!kbuf) {
75 		*lenp = 0;
76 		return -ENOMEM;
77 	}
78 
79 	len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
80 	if (!len) {
81 		*lenp = 0;
82 		goto free_buf;
83 	}
84 
85 	/* scnprintf writes a trailing null char not counted in the returned
86 	 * length, override it with a newline.
87 	 */
88 	kbuf[len++] = '\n';
89 	memcpy(buffer, kbuf, len);
90 	*lenp = len;
91 	*ppos += len;
92 
93 free_buf:
94 	kfree(kbuf);
95 	return 0;
96 }
97 #endif
98 
#ifdef CONFIG_RPS

/* Serializes lazy allocation of and updates to net->core.rps_default_mask. */
DEFINE_MUTEX(rps_default_mask_mutex);
102 
103 static int rps_default_mask_sysctl(const struct ctl_table *table, int write,
104 				   void *buffer, size_t *lenp, loff_t *ppos)
105 {
106 	struct net *net = (struct net *)table->data;
107 	struct cpumask *mask;
108 	int err = 0;
109 
110 	mutex_lock(&rps_default_mask_mutex);
111 	mask = net->core.rps_default_mask;
112 	if (write) {
113 		if (!mask) {
114 			mask = kzalloc(cpumask_size(), GFP_KERNEL);
115 			net->core.rps_default_mask = mask;
116 		}
117 		err = -ENOMEM;
118 		if (!mask)
119 			goto done;
120 
121 		err = cpumask_parse(buffer, mask);
122 		if (err)
123 			goto done;
124 
125 		err = rps_cpumask_housekeeping(mask);
126 		if (err)
127 			goto done;
128 	} else {
129 		err = dump_cpumask(buffer, lenp, ppos,
130 				   mask ?: cpu_none_mask);
131 	}
132 
133 done:
134 	mutex_unlock(&rps_default_mask_mutex);
135 	return err;
136 }
137 
/* proc handler for "rps_sock_flow_entries".
 *
 * Writing a non-zero value (rounded up to a power of two, capped at
 * 2^29) (re)allocates the global RPS socket flow table and enables the
 * rps/rfs static branches; writing 0 tears the table down.  Reads
 * report the current table size.
 */
static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	unsigned int orig_size, size;
	int ret, i;
	/* Proxy table so proc_dointvec() operates on the local "size". */
	struct ctl_table tmp = {
		.data = &size,
		.maxlen = sizeof(size),
		.mode = table->mode
	};
	struct rps_sock_flow_table *orig_sock_table, *sock_table;
	static DEFINE_MUTEX(sock_flow_mutex);

	mutex_lock(&sock_flow_mutex);

	orig_sock_table = rcu_dereference_protected(
					net_hotdata.rps_sock_flow_table,
					lockdep_is_held(&sock_flow_mutex));
	/* Table size is mask + 1; 0 means "no table allocated". */
	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;

	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);

	if (write) {
		if (size) {
			if (size > 1<<29) {
				/* Enforce limit to prevent overflow */
				mutex_unlock(&sock_flow_mutex);
				return -EINVAL;
			}
			size = roundup_pow_of_two(size);
			if (size != orig_size) {
				sock_table =
				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
				if (!sock_table) {
					mutex_unlock(&sock_flow_mutex);
					return -ENOMEM;
				}
				net_hotdata.rps_cpu_mask =
					roundup_pow_of_two(nr_cpu_ids) - 1;
				sock_table->mask = size - 1;
			} else
				sock_table = orig_sock_table;

			/* (Re)initialize every entry, even when the size
			 * (and therefore the table) is unchanged.
			 */
			for (i = 0; i < size; i++)
				sock_table->ents[i] = RPS_NO_CPU;
		} else
			sock_table = NULL;

		if (sock_table != orig_sock_table) {
			rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
					   sock_table);
			if (sock_table) {
				static_branch_inc(&rps_needed);
				static_branch_inc(&rfs_needed);
			}
			if (orig_sock_table) {
				static_branch_dec(&rps_needed);
				static_branch_dec(&rfs_needed);
				/* Readers may still hold the old table;
				 * free it only after a grace period.
				 */
				kvfree_rcu(orig_sock_table, rcu);
			}
		}
	}

	mutex_unlock(&sock_flow_mutex);

	return ret;
}
205 #endif /* CONFIG_RPS */
206 
#ifdef CONFIG_NET_FLOW_LIMIT
/* Protects per-CPU sd->flow_limit pointers and the table length knob. */
static DEFINE_MUTEX(flow_limit_update_mutex);
209 
/* proc handler for "flow_limit_cpu_bitmap".
 *
 * Writing a cpumask enables flow limiting on CPUs in the mask
 * (allocating a per-CPU sd_flow_limit) and disables it on CPUs that
 * are cleared.  Reads dump the mask of CPUs that currently have a
 * table attached.
 */
static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	struct sd_flow_limit *cur;
	struct softnet_data *sd;
	cpumask_var_t mask;
	int i, len, ret = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	if (write) {
		ret = cpumask_parse(buffer, mask);
		if (ret)
			goto done;

		mutex_lock(&flow_limit_update_mutex);
		len = sizeof(*cur) + netdev_flow_limit_table_len;
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			cur = rcu_dereference_protected(sd->flow_limit,
				     lockdep_is_held(&flow_limit_update_mutex));
			if (cur && !cpumask_test_cpu(i, mask)) {
				/* CPU dropped from the mask: detach its
				 * table and free it after a grace period.
				 */
				RCU_INIT_POINTER(sd->flow_limit, NULL);
				kfree_rcu(cur, rcu);
			} else if (!cur && cpumask_test_cpu(i, mask)) {
				/* CPU added to the mask: attach a fresh,
				 * node-local table.
				 */
				cur = kzalloc_node(len, GFP_KERNEL,
						   cpu_to_node(i));
				if (!cur) {
					/* not unwinding previous changes */
					ret = -ENOMEM;
					goto write_unlock;
				}
				cur->log_buckets = ilog2(netdev_flow_limit_table_len);
				rcu_assign_pointer(sd->flow_limit, cur);
			}
		}
write_unlock:
		mutex_unlock(&flow_limit_update_mutex);
	} else {
		cpumask_clear(mask);
		rcu_read_lock();
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			if (rcu_dereference(sd->flow_limit))
				cpumask_set_cpu(i, mask);
		}
		rcu_read_unlock();

		ret = dump_cpumask(buffer, lenp, ppos, mask);
	}

done:
	free_cpumask_var(mask);
	return ret;
}
266 
267 static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write,
268 				       void *buffer, size_t *lenp, loff_t *ppos)
269 {
270 	unsigned int old, *ptr;
271 	int ret;
272 
273 	mutex_lock(&flow_limit_update_mutex);
274 
275 	ptr = table->data;
276 	old = *ptr;
277 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
278 	if (!ret && write && !is_power_of_2(*ptr)) {
279 		*ptr = old;
280 		ret = -EINVAL;
281 	}
282 
283 	mutex_unlock(&flow_limit_update_mutex);
284 	return ret;
285 }
286 #endif /* CONFIG_NET_FLOW_LIMIT */
287 
288 #ifdef CONFIG_NET_SCHED
289 static int set_default_qdisc(const struct ctl_table *table, int write,
290 			     void *buffer, size_t *lenp, loff_t *ppos)
291 {
292 	char id[IFNAMSIZ];
293 	struct ctl_table tbl = {
294 		.data = id,
295 		.maxlen = IFNAMSIZ,
296 	};
297 	int ret;
298 
299 	qdisc_get_default(id, IFNAMSIZ);
300 
301 	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
302 	if (write && ret == 0)
303 		ret = qdisc_set_default(id);
304 	return ret;
305 }
306 #endif
307 
308 static int proc_do_dev_weight(const struct ctl_table *table, int write,
309 			   void *buffer, size_t *lenp, loff_t *ppos)
310 {
311 	static DEFINE_MUTEX(dev_weight_mutex);
312 	int ret, weight;
313 
314 	mutex_lock(&dev_weight_mutex);
315 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
316 	if (!ret && write) {
317 		weight = READ_ONCE(weight_p);
318 		WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
319 		WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
320 	}
321 	mutex_unlock(&dev_weight_mutex);
322 
323 	return ret;
324 }
325 
326 static int proc_do_rss_key(const struct ctl_table *table, int write,
327 			   void *buffer, size_t *lenp, loff_t *ppos)
328 {
329 	char buf[NETDEV_RSS_KEY_LEN * 3];
330 	struct ctl_table fake_table;
331 	char *pos = buf;
332 
333 	for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) {
334 		pos = hex_byte_pack(pos, netdev_rss_key[i]);
335 		*pos++ = ':';
336 	}
337 	*(--pos) = 0;
338 
339 	fake_table.data = buf;
340 	fake_table.maxlen = sizeof(buf);
341 	return proc_dostring(&fake_table, write, buffer, lenp, ppos);
342 }
343 
344 #ifdef CONFIG_BPF_JIT
/* proc handler for "bpf_jit_enable".
 *
 * Writes require CAP_SYS_ADMIN.  Value 2 (JIT debug dump) is only
 * accepted when the current creds may see raw addresses
 * (bpf_dump_raw_ok()) and triggers a loud warning.  With
 * CONFIG_BPF_JIT_ALWAYS_ON the table's min/max pin the value to 1.
 */
static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write,
					   void *buffer, size_t *lenp,
					   loff_t *ppos)
{
	int ret, jit_enable = *(int *)table->data;
	int min = *(int *)table->extra1;
	int max = *(int *)table->extra2;
	struct ctl_table tmp = *table;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Validate against a local copy so a rejected value never becomes
	 * visible through table->data.
	 */
	tmp.data = &jit_enable;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret) {
		if (jit_enable < 2 ||
		    (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) {
			*(int *)table->data = jit_enable;
			if (jit_enable == 2)
				pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
		} else {
			ret = -EPERM;
		}
	}

	/* min == max only when CONFIG_BPF_JIT_ALWAYS_ON pinned the range. */
	if (write && ret && min == max)
		pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");

	return ret;
}
375 
376 # ifdef CONFIG_HAVE_EBPF_JIT
377 static int
378 proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
379 				    void *buffer, size_t *lenp, loff_t *ppos)
380 {
381 	if (!capable(CAP_SYS_ADMIN))
382 		return -EPERM;
383 
384 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 }
386 # endif /* CONFIG_HAVE_EBPF_JIT */
387 
388 static int
389 proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
390 				     void *buffer, size_t *lenp, loff_t *ppos)
391 {
392 	if (!capable(CAP_SYS_ADMIN))
393 		return -EPERM;
394 
395 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
396 }
397 #endif
398 
/* Global (non-namespaced) net/core sysctls, registered once against
 * init_net in sysctl_core_init().
 */
static struct ctl_table net_core_table[] = {
	{
		.procname	= "mem_pcpu_rsv",
		.data		= &net_hotdata.sysctl_mem_pcpu_rsv,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_mem_pcpu_rsv,
	},
	{
		.procname	= "dev_weight",
		.data		= &weight_p,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
		.extra1         = SYSCTL_ONE,
	},
	{
		.procname	= "dev_weight_rx_bias",
		.data		= &dev_weight_rx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
		.extra1         = SYSCTL_ONE,
	},
	{
		.procname	= "dev_weight_tx_bias",
		.data		= &dev_weight_tx_bias,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_do_dev_weight,
		.extra1         = SYSCTL_ONE,
	},
	{
		.procname	= "netdev_max_backlog",
		.data		= &net_hotdata.max_backlog,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "qdisc_max_burst",
		.data		= &net_hotdata.qdisc_max_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		/* .data/.maxlen unused: proc_do_rss_key() formats the key
		 * itself through a local proxy table.
		 */
		.procname	= "netdev_rss_key",
		.data		= &netdev_rss_key,
		.maxlen		= sizeof(int),
		.mode		= 0444,
		.proc_handler	= proc_do_rss_key,
	},
#ifdef CONFIG_BPF_JIT
	{
		.procname	= "bpf_jit_enable",
		.data		= &bpf_jit_enable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax_bpf_enable,
# ifdef CONFIG_BPF_JIT_ALWAYS_ON
		/* JIT forced on: value is pinned to 1. */
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_ONE,
# else
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
# endif
	},
# ifdef CONFIG_HAVE_EBPF_JIT
	{
		.procname	= "bpf_jit_harden",
		.data		= &bpf_jit_harden,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "bpf_jit_kallsyms",
		.data		= &bpf_jit_kallsyms,
		.maxlen		= sizeof(int),
		.mode		= 0600,
		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
# endif
	{
		.procname	= "bpf_jit_limit",
		.data		= &bpf_jit_limit,
		.maxlen		= sizeof(long),
		.mode		= 0600,
		.proc_handler	= proc_dolongvec_minmax_bpf_restricted,
		.extra1		= SYSCTL_LONG_ONE,
		.extra2		= &bpf_jit_limit_max,
	},
#endif
	{
		.procname	= "netdev_tstamp_prequeue",
		.data		= &net_hotdata.tstamp_prequeue,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "message_cost",
		.data		= &net_ratelimit_state.interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "message_burst",
		.data		= &net_ratelimit_state.burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
#ifdef CONFIG_RPS
	{
		/* No .data: the handler manages the global flow table. */
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
	{
		/* No .data: the handler walks per-CPU softnet_data. */
		.procname	= "flow_limit_cpu_bitmap",
		.mode		= 0644,
		.proc_handler	= flow_limit_cpu_sysctl
	},
	{
		.procname	= "flow_limit_table_len",
		.data		= &netdev_flow_limit_table_len,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= flow_limit_table_len_sysctl
	},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
	{
		.procname	= "busy_poll",
		.data		= &sysctl_net_busy_poll,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "busy_read",
		.data		= &sysctl_net_busy_read,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif
#ifdef CONFIG_NET_SCHED
	{
		.procname	= "default_qdisc",
		.mode		= 0644,
		.maxlen		= IFNAMSIZ,
		.proc_handler	= set_default_qdisc
	},
#endif
	{
		.procname	= "netdev_budget",
		.data		= &net_hotdata.netdev_budget,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "warnings",
		.data		= &net_msg_warn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "max_skb_frags",
		.data		= &net_hotdata.sysctl_max_skb_frags,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= &max_skb_frags,
	},
	{
		.procname	= "netdev_budget_usecs",
		.data		= &net_hotdata.netdev_budget_usecs,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &netdev_budget_usecs_min,
	},
	{
		.procname	= "fb_tunnels_only_for_init_net",
		.data		= &sysctl_fb_tunnels_only_for_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "devconf_inherit_init_net",
		.data		= &sysctl_devconf_inherit_init_net,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_THREE,
	},
	{
		.procname	= "high_order_alloc_disable",
		.data		= &net_high_order_alloc_disable_key.key,
		.maxlen         = sizeof(net_high_order_alloc_disable_key),
		.mode		= 0644,
		.proc_handler	= proc_do_static_key,
	},
	{
		.procname	= "gro_normal_batch",
		.data		= &net_hotdata.gro_normal_batch,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
	{
		.procname	= "netdev_unregister_timeout_secs",
		.data		= &netdev_unregister_timeout_secs,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= &int_3600,
	},
	{
		.procname	= "skb_defer_max",
		.data		= &net_hotdata.sysctl_skb_defer_max,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
};
649 
/* Per-network-namespace net/core sysctls.  For non-init namespaces this
 * table is duplicated and its .data pointers are rebased by
 * sysctl_core_net_init().
 */
static struct ctl_table netns_core_table[] = {
#if IS_ENABLED(CONFIG_RPS)
	{
		/* .data is the struct net pointer; rebased per netns. */
		.procname	= "rps_default_mask",
		.data		= &init_net,
		.mode		= 0644,
		.proc_handler	= rps_default_mask_sysctl
	},
#endif
	{
		.procname	= "somaxconn",
		.data		= &init_net.core.sysctl_somaxconn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= SYSCTL_ZERO,
		.proc_handler	= proc_dointvec_minmax
	},
	{
		.procname	= "optmem_max",
		.data		= &init_net.core.sysctl_optmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= SYSCTL_ZERO,
		.proc_handler	= proc_dointvec_minmax
	},
	{
		.procname	= "txrehash",
		.data		= &init_net.core.sysctl_txrehash,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
		.proc_handler	= proc_dou8vec_minmax,
	},
	{
		.procname	= "txq_reselection_ms",
		.data		= &init_net.core.sysctl_txq_reselection,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "tstamp_allow_data",
		.data		= &init_net.core.sysctl_tstamp_allow_data,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE
	},
	{
		.procname	= "bypass_prot_mem",
		.data		= &init_net.core.sysctl_bypass_prot_mem,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE
	},
	/* sysctl_core_net_init() will set the values after this
	 * to readonly in network namespaces
	 */
	{
		.procname	= "wmem_max",
		.data		= &sysctl_wmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_max",
		.data		= &sysctl_rmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "wmem_default",
		.data		= &sysctl_wmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_default",
		.data		= &sysctl_rmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
};
745 
746 static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
747 {
748 	/* fallback tunnels for initns only */
749 	if (!strncmp(str, "initns", 6))
750 		sysctl_fb_tunnels_only_for_init_net = 1;
751 	/* no fallback tunnels anywhere */
752 	else if (!strncmp(str, "none", 4))
753 		sysctl_fb_tunnels_only_for_init_net = 2;
754 
755 	return 1;
756 }
757 __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
758 
/* Per-netns registration of the net/core sysctl table.
 *
 * For namespaces other than init_net the table is duplicated; entries
 * up to (but not including) "wmem_max" have their ->data pointers
 * rebased from init_net to the new netns, while the remaining entries
 * (the true globals: wmem_max, rmem_max, wmem_default, rmem_default)
 * are made read-only there.
 */
static __net_init int sysctl_core_net_init(struct net *net)
{
	size_t table_size = ARRAY_SIZE(netns_core_table);
	struct ctl_table *tbl;

	tbl = netns_core_table;
	if (!net_eq(net, &init_net)) {
		int i;
		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;

		for (i = 0; i < table_size; ++i) {
			/* "wmem_max" marks the first global-only entry;
			 * stop rebasing there.
			 */
			if (tbl[i].data == &sysctl_wmem_max)
				break;

			/* Rebase ->data from init_net to this netns. */
			tbl[i].data += (char *)net - (char *)&init_net;
		}
		/* Strip write permission from the global entries. */
		for (; i < table_size; ++i)
			tbl[i].mode &= ~0222;
	}

	net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);
	if (net->core.sysctl_hdr == NULL)
		goto err_reg;

	return 0;

err_reg:
	if (tbl != netns_core_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
793 
/* Per-netns teardown: unregister the sysctl table and free the
 * netns-owned copies allocated in sysctl_core_net_init().
 */
static __net_exit void sysctl_core_net_exit(struct net *net)
{
	const struct ctl_table *tbl;

	/* Grab the table pointer before unregistering the header. */
	tbl = net->core.sysctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->core.sysctl_hdr);
	/* Only init_net uses the static table, and init_net never exits. */
	BUG_ON(tbl == netns_core_table);
#if IS_ENABLED(CONFIG_RPS)
	kfree(net->core.rps_default_mask);
#endif
	kfree(tbl);
}
806 
/* Hook per-netns setup/teardown into network namespace lifecycle. */
static __net_initdata struct pernet_operations sysctl_core_ops = {
	.init = sysctl_core_net_init,
	.exit = sysctl_core_net_exit,
};
811 
/* Register the global net/core table and the per-netns operations. */
static __init int sysctl_core_init(void)
{
	/* NOTE(review): return value of register_net_sysctl() is ignored;
	 * presumably registration against init_net cannot fail in a way
	 * worth handling here — confirm.
	 */
	register_net_sysctl(&init_net, "net/core", net_core_table);
	return register_pernet_subsys(&sysctl_core_ops);
}

fs_initcall(sysctl_core_init);
819